LCOV - code coverage report
Current view: top level - sal/textenc - tcvtutf8.cxx (source / functions) Hit Total Coverage
Test: commit c8344322a7af75b84dd3ca8f78b05543a976dfd5 Lines: 182 208 87.5 %
Date: 2015-06-13 12:38:46 Functions: 8 8 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : #include "sal/config.h"
      21             : 
      22             : #include "sal/types.h"
      23             : #include "rtl/textcvt.h"
      24             : 
      25             : #include "converter.hxx"
      26             : #include "tcvtutf8.hxx"
      27             : #include "tenchelp.hxx"
      28             : #include "unichars.hxx"
      29             : 
      30             : struct ImplUtf8ToUnicodeContext
                       // Decoder state that ImplConvertUtf8ToUnicode loads at entry and stores
                       // back at exit, so a UTF-8 sequence may be split across calls.
      31             : {
                           // Bits of the code point assembled so far from the current sequence.
      32             :     sal_uInt32 nUtf32;
                           // Left-shift to apply to the next continuation byte's six payload bits;
                           // -1 while no multi-byte sequence is in progress.
      33             :     int nShift;
                           // True until the first character (or first bad input) has been handled;
                           // only while it is true may a leading U+FEFF be dropped as a signature.
      34             :     bool bCheckBom;
      35             : };
      36             : 
      37             : struct ImplUnicodeToUtf8Context
                       // Encoder state that ImplConvertUnicodeToUtf8 loads at entry and stores
                       // back at exit, so a surrogate pair may be split across calls.
      38             : {
                           // Pending high surrogate still waiting for its low surrogate, or 0 when
                           // nothing is pending. The reset value 0xFFFF marks a fresh context for
                           // which the BOM has not been considered yet.
      39             :     sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
      40             : };
      41             : 
      42          59 : void * ImplCreateUtf8ToUnicodeContext()
                       // Allocates a UTF-8 to Unicode decoder context, puts it into its initial
                       // state and returns it as an untyped pointer. The matching release
                       // function is ImplDestroyUtf8ToUnicodeContext.
                       //
                       // NOTE(review): the object is default-initialized and the reset call only
                       // writes nShift and bCheckBom, so nUtf32 starts out with an indeterminate
                       // value.
      43             : {
      44          59 :     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
      45          59 :     ImplResetUtf8ToUnicodeContext(p);
      46          59 :     return p;
      47             : }
      48             : 
      49          71 : void ImplResetUtf8ToUnicodeContext(void * pContext)
                       // Returns a decoder context to its initial state: no sequence in progress
                       // and BOM checking armed. A NULL pContext is accepted and ignored.
                       //
                       // NOTE(review): nUtf32 is not touched here. ImplConvertUtf8ToUnicode copies
                       // it out of the context at entry regardless of nShift, although every
                       // path taken while nShift is negative overwrites the copy before using it.
      50             : {
      51          71 :     if (pContext != NULL)
      52             :     {
      53          71 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
      54          71 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
      55             :     }
      56          71 : }
      57             : 
      58          58 : void ImplDestroyUtf8ToUnicodeContext(void * pContext)
                       // Releases a context obtained from ImplCreateUtf8ToUnicodeContext.
                       // Passing NULL is harmless because deleting a null pointer does nothing.
      59             : {
      60          58 :     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
      61          58 : }
      62             : 
      63     1293427 : sal_Size ImplConvertUtf8ToUnicode(
      64             :     void const * pData, void * pContext, char const * pSrcBuf,
      65             :     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
      66             :     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
                       // Decodes UTF-8 bytes from pSrcBuf into UTF-16 code units in pDestBuf.
                       //
                       // pData         non-NULL selects Java UTF-8 handling; the only effect
                       //               visible in this function is that a leading U+FEFF is
                       //               never dropped in that mode.
                       // pContext      optional ImplUtf8ToUnicodeContext; when given, decoder
                       //               state is loaded from it at entry and stored back at exit.
                       // pSrcBuf       input bytes, nSrcBytes of them.
                       // pDestBuf      output buffer with room for nDestChars code units.
                       // nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; this function itself tests
                       //               GLOBAL_SIGNATURE and FLUSH and forwards the whole value to
                       //               the bad-input handler.
                       // pInfo         if non-NULL, receives the accumulated
                       //               RTL_TEXTTOUNICODE_INFO_* bits.
                       // pSrcCvtBytes  if non-NULL, receives the number of input bytes consumed.
                       //
                       // Returns the number of code units written to pDestBuf.
      67             : {
      68             :     /*
      69             :        This function is very liberal with the UTF-8 input.  Accepted are:
      70             :        - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
      71             :        - surrogates (e.g., ED A0 80 to represent U+D800)
      72             :        - encodings with up to six bytes (everything outside the range
      73             :          U+0000..10FFFF is considered "undefined")
      74             :        The first two of these points allow this routine to translate from both
      75             :        RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
      76             :       */
      77             : 
      78     1293427 :     bool bJavaUtf8 = pData != NULL;
                           // Working copies of the decoder state; these defaults are used when
                           // the caller supplies no context.
      79     1293427 :     sal_uInt32 nUtf32 = 0;
      80     1293427 :     int nShift = -1;
      81     1293427 :     bool bCheckBom = true;
      82     1293427 :     sal_uInt32 nInfo = 0;
                           // Bytes are examined as unsigned so that values of 0x80 and above
                           // compare as expected.
      83     1293427 :     unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
      84     1293427 :     unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
      85     1293427 :     sal_Unicode * pDestBufPtr = pDestBuf;
      86     1293427 :     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
      87             : 
      88     1293427 :     if (pContext != NULL)
      89             :     {
      90      109825 :         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
      91      109825 :         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
      92      109825 :         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
      93             :     }
      94             : 
      95    29716962 :     while (pSrcBufPtr < pSrcBufEnd)
      96             :     {
      97    27130196 :         bool bUndefined = false;
                               // Cleared when the offending byte should be re-read as the start
                               // of a new sequence instead of being swallowed with the error.
      98    27130196 :         bool bConsume = true;
      99    27130196 :         sal_uInt32 nChar = *pSrcBufPtr++;
                               // nShift < 0: no sequence in progress, so this byte must be a
                               // single-byte character or a lead byte.
     100    27130196 :         if (nShift < 0)
     101    21903158 :             if (nChar <= 0x7F)
     102             :             {
     103    19249171 :                 nUtf32 = nChar;
     104    19249171 :                 goto transform;
     105             :             }
                                   // 0x80..0xBF: a continuation byte with no lead byte before it.
     106     2653987 :             else if (nChar <= 0xBF)
     107       12870 :                 goto bad_input;
                                   // Lead bytes of two- to six-byte sequences. Each branch keeps the
                                   // payload bits of the lead byte and sets nShift to the shift of
                                   // the last-but-one continuation byte (0 when only one follows).
     108     2641117 :             else if (nChar <= 0xDF)
     109             :             {
     110       43973 :                 nUtf32 = (nChar & 0x1F) << 6;
     111       43973 :                 nShift = 0;
     112             :             }
     113     2597144 :             else if (nChar <= 0xEF)
     114             :             {
     115     2591773 :                 nUtf32 = (nChar & 0x0F) << 12;
     116     2591773 :                 nShift = 6;
     117             :             }
     118        5371 :             else if (nChar <= 0xF7)
     119             :             {
     120         788 :                 nUtf32 = (nChar & 0x07) << 18;
     121         788 :                 nShift = 12;
     122             :             }
     123        4583 :             else if (nChar <= 0xFB)
     124             :             {
     125         753 :                 nUtf32 = (nChar & 0x03) << 24;
     126         753 :                 nShift = 18;
     127             :             }
     128        3830 :             else if (nChar <= 0xFD)
     129             :             {
     130         184 :                 nUtf32 = (nChar & 0x01) << 30;
     131         184 :                 nShift = 24;
     132             :             }
                                   // 0xFE and 0xFF are never accepted as lead bytes.
     133             :             else
     134        3646 :                 goto bad_input;
                               // A sequence is in progress and this is a continuation byte: merge
                               // its six payload bits at the current shift position.
     135     5227038 :         else if ((nChar & 0xC0) == 0x80)
     136             :         {
     137     5216715 :             nUtf32 |= (nChar & 0x3F) << nShift;
                                   // Shift 0 means this was the final byte of the sequence.
     138     5216715 :             if (nShift == 0)
     139     2627144 :                 goto transform;
     140             :             else
     141     2589571 :                 nShift -= 6;
     142             :         }
     143             :         else
     144             :         {
     145             :             /*
     146             :              This byte is preceded by a broken UTF-8 sequence; if this byte
     147             :              is neither in the range [0x80..0xBF] nor in the range
     148             :              [0xFE..0xFF], assume that this byte does not belong to that
     149             :              broken sequence, but instead starts a new, legal UTF-8 sequence:
     150             :              */
     151       10323 :             bConsume = nChar >= 0xFE;
     152       10323 :             goto bad_input;
     153             :         }
                               // Sequence still incomplete: fetch the next byte.
     154     5227042 :         continue;
     155             : 
     156             :     transform:
                               // A complete code point is in nUtf32. It is written out unless it is
                               // U+FEFF seen as the very first character while the caller asked
                               // for signature handling and Java mode is off; in that one case the
                               // character is silently dropped.
     157    21876315 :         if (!bCheckBom || nUtf32 != 0xFEFF
     158          10 :             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
     159           6 :             || bJavaUtf8)
     160             :         {
                                   // Values up to 0xFFFF, surrogate values included, become one
                                   // code unit.
     161    21876311 :             if (nUtf32 <= 0xFFFF)
     162    21876288 :                 if (pDestBufPtr != pDestBufEnd)
     163    21876287 :                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
     164             :                 else
     165           1 :                     goto no_output;
                                   // Supplementary code points need room for a surrogate pair.
     166          23 :             else if (nUtf32 <= 0x10FFFF)
     167          15 :                 if (pDestBufEnd - pDestBufPtr >= 2)
     168             :                 {
     169          15 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
     170          15 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
     171             :                 }
     172             :                 else
     173           0 :                     goto no_output;
                                   // Anything above U+10FFFF is reported as undefined input.
     174             :             else
     175             :             {
     176           8 :                 bUndefined = true;
     177           8 :                 goto bad_input;
     178             :             }
     179             :         }
                               // Character done: leave sequence mode and stop looking for a BOM.
     180    21876306 :         nShift = -1;
     181    21876306 :         bCheckBom = false;
     182    21876306 :         continue;
     183             : 
     184             :     bad_input:
                               // The shared handler decides, based on nFlags, how the error is
                               // dealt with; its result selects one of three continuations.
     185       26847 :         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     186             :                     bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     187       26847 :                     &nInfo))
     188             :         {
                               // Stop converting. The sequence state is discarded and a byte that
                               // was not part of the broken sequence is handed back to the input.
     189             :         case sal::detail::textenc::BAD_INPUT_STOP:
     190          31 :             nShift = -1;
     191          31 :             bCheckBom = false;
     192          31 :             if (!bConsume)
     193           2 :                 --pSrcBufPtr;
     194          31 :             break;
     195             : 
                               // Carry on with the next byte, re-reading the current one if it may
                               // start a new sequence.
     196             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
     197       26760 :             nShift = -1;
     198       26760 :             bCheckBom = false;
     199       26760 :             if (!bConsume)
     200        9470 :                 --pSrcBufPtr;
     201       26760 :             continue;
     202             : 
     203             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     204          56 :             goto no_output;
     205             :         }
                               // Reached only via BAD_INPUT_STOP: leave the conversion loop.
     206          31 :         break;
     207             : 
     208             :     no_output:
                               // Destination exhausted: hand the byte just read back to the input
                               // and stop. nShift and nUtf32 are left as they are, so when a
                               // context is supplied the next call can re-read that byte and
                               // complete the same character.
     209          57 :         --pSrcBufPtr;
     210          57 :         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     211          57 :         break;
     212             :     }
     213             : 
                           // The input ended inside a multi-byte sequence and nothing else went
                           // wrong. Without FLUSH this is reported as a too-small source buffer;
                           // with FLUSH the truncated sequence is treated as bad input.
     214     1293427 :     if (nShift >= 0
     215         106 :         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
     216             :                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
     217             :                == 0)
     218             :     {
     219          50 :         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
     220          49 :             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
     221             :         else
     222           1 :             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     223             :                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     224           1 :                         &nInfo))
     225             :             {
     226             :             case sal::detail::textenc::BAD_INPUT_STOP:
     227             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     228           1 :                 nShift = -1;
     229           1 :                 bCheckBom = false;
     230           1 :                 break;
     231             : 
     232             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     233           0 :                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     234           0 :                 break;
     235             :             }
     236             :     }
     237             : 
                           // Store the decoder state back for the next call and report results
                           // through whichever out-parameters the caller supplied.
     238     1293427 :     if (pContext != NULL)
     239             :     {
     240      109825 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
     241      109825 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
     242      109825 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
     243             :     }
     244     1293427 :     if (pInfo != NULL)
     245     1293427 :         *pInfo = nInfo;
     246     1293427 :     if (pSrcCvtBytes != NULL)
     247     1293427 :         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
     248     1293427 :     return pDestBufPtr - pDestBuf;
     249             : }
     250             : 
     251        2332 : void * ImplCreateUnicodeToUtf8Context()
                       // Allocates a Unicode to UTF-8 encoder context, puts it into its initial
                       // state and returns it as an untyped pointer. The matching release
                       // function is ImplDestroyUnicodeToUtf8Context.
     252             : {
     253        2332 :     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
     254        2332 :     ImplResetUnicodeToUtf8Context(p);
     255        2332 :     return p;
     256             : }
     257             : 
     258        2332 : void ImplResetUnicodeToUtf8Context(void * pContext)
                       // Returns an encoder context to its initial state. The value 0xFFFF tells
                       // ImplConvertUnicodeToUtf8 that no output has been started yet, so the BOM
                       // decision is still open. A NULL pContext is accepted and ignored.
     259             : {
     260        2332 :     if (pContext != NULL)
     261        2332 :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
     262        2332 : }
     263             : 
     264        2333 : void ImplDestroyUnicodeToUtf8Context(void * pContext)
                       // Releases a context obtained from ImplCreateUnicodeToUtf8Context.
                       // Passing NULL is harmless because deleting a null pointer does nothing.
     265             : {
     266        2333 :     delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
     267        2333 : }
     268             : 
     269      876120 : sal_Size ImplConvertUnicodeToUtf8(
     270             :     void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
     271             :     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
     272             :     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
                       // Encodes UTF-16 code units from pSrcBuf as UTF-8 bytes in pDestBuf.
                       //
                       // pData         non-NULL selects Java UTF-8 output: no BOM is written,
                       //               surrogates are not combined but encoded one by one as
                       //               three-byte sequences, and U+0000 is written as the two
                       //               bytes C0 80.
                       // pContext      optional ImplUnicodeToUtf8Context; when given, the pending
                       //               high surrogate / BOM state is loaded from it at entry and
                       //               stored back at exit.
                       // pSrcBuf       input code units, nSrcChars of them.
                       // pDestBuf      output buffer with room for nDestBytes bytes.
                       // nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; this function itself tests
                       //               GLOBAL_SIGNATURE and FLUSH and forwards the whole value to
                       //               the bad-input handler.
                       // pInfo         if non-NULL, receives the accumulated
                       //               RTL_UNICODETOTEXT_INFO_* bits.
                       // pSrcCvtChars  if non-NULL, receives the number of code units consumed.
                       //
                       // Returns the number of bytes written to pDestBuf.
     273             : {
     274      876120 :     bool bJavaUtf8 = pData != NULL;
                           // Without a context every call behaves like the first call on a
                           // fresh context.
     275      876120 :     sal_Unicode nHighSurrogate = 0xFFFF;
     276      876120 :     sal_uInt32 nInfo = 0;
     277      876120 :     sal_Unicode const * pSrcBufPtr = pSrcBuf;
     278      876120 :     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
     279      876120 :     char * pDestBufPtr = pDestBuf;
     280      876120 :     char * pDestBufEnd = pDestBufPtr + nDestBytes;
     281             : 
     282      876120 :     if (pContext != NULL)
     283             :         nHighSurrogate
     284       19855 :             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
     285             : 
                           // 0xFFFF: nothing has been emitted yet, so this is the one point at
                           // which a BOM may be written.
     286      876120 :     if (nHighSurrogate == 0xFFFF)
     287             :     {
     288      858597 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
     289           3 :             && !bJavaUtf8)
     290             :         {
     291           2 :             if (pDestBufEnd - pDestBufPtr >= 3)
     292             :             {
     293             :                 /* Write BOM (U+FEFF) as UTF-8: */
     294           2 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
     295           2 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
     296           2 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
     297             :             }
     298             :             else
     299             :             {
                                       // No room for the BOM: bail out with the state still at
                                       // 0xFFFF, so a later call with the same context tries again.
     300           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     301           0 :                 goto done;
     302             :             }
     303             :         }
     304      858597 :         nHighSurrogate = 0;
     305             :     }
     306             : 
     307    37654615 :     while (pSrcBufPtr < pSrcBufEnd)
     308             :     {
     309    35902375 :         sal_uInt32 nChar = *pSrcBufPtr++;
     310    35902375 :         if (nHighSurrogate == 0)
     311             :         {
                                   // Hold a high surrogate back until its partner arrives. In Java
                                   // mode it is not held back and falls through to be encoded.
     312    35902374 :             if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
     313             :             {
     314           1 :                 nHighSurrogate = (sal_Unicode) nChar;
     315           1 :                 continue;
     316             :             }
     317             :         }
                               // A high surrogate is pending: the current unit must complete it.
     318           1 :         else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
     319           1 :             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
                               // NOTE(review): the unit that failed to complete the pair has already
                               // been consumed, and the BAD_INPUT_CONTINUE path below does not hand
                               // it back, so it is dropped together with the unpaired high
                               // surrogate even if it is an ordinary character. Confirm intended.
     320             :         else
     321           0 :             goto bad_input;
     322             : 
                               // Reject a low surrogate that has no preceding high surrogate (outside
                               // Java mode) and whatever ImplIsNoncharacter classifies as a
                               // noncharacter.
     323    71804749 :         if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
     324    71804748 :             || ImplIsNoncharacter(nChar))
     325           2 :             goto bad_input;
     326             : 
                               // One byte for U+0000..U+007F, except that Java mode diverts U+0000
                               // to the two-byte branch below.
     327    35902372 :         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
     328    71668336 :             if (pDestBufPtr != pDestBufEnd)
     329    35834168 :                 *pDestBufPtr++ = static_cast< char >(nChar);
     330             :             else
     331           0 :                 goto no_output;
                               // Two bytes: 110xxxxx 10xxxxxx.
     332       68204 :         else if (nChar <= 0x7FF)
     333        4977 :             if (pDestBufEnd - pDestBufPtr >= 2)
     334             :             {
     335        4977 :                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
     336        4977 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     337             :             }
     338             :             else
     339           0 :                 goto no_output;
                               // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
     340       63227 :         else if (nChar <= 0xFFFF)
     341       63226 :             if (pDestBufEnd - pDestBufPtr >= 3)
     342             :             {
     343       63226 :                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
     344       63226 :                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     345       63226 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     346             :             }
     347             :             else
     348           0 :                 goto no_output;
                               // Four bytes for a combined surrogate pair: 11110xxx followed by three
                               // continuation bytes.
     349           1 :         else if (pDestBufEnd - pDestBufPtr >= 4)
     350             :         {
     351           1 :             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
     352           1 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
     353           1 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     354           1 :             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     355             :         }
     356             :         else
     357           0 :             goto no_output;
                               // Character written: no surrogate is pending any more.
     358    35902372 :         nHighSurrogate = 0;
     359    35902372 :         continue;
     360             : 
     361             :     bad_input:
                               // The shared handler decides, based on nFlags, how the error is
                               // dealt with; its result selects one of three continuations.
     362           2 :         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     363             :                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
     364           2 :                     0, NULL))
     365             :         {
     366             :         case sal::detail::textenc::BAD_INPUT_STOP:
     367           0 :             nHighSurrogate = 0;
     368           0 :             break;
     369             : 
     370             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
     371           2 :             nHighSurrogate = 0;
     372           2 :             continue;
     373             : 
     374             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     375           0 :             goto no_output;
     376             :         }
                               // Reached only via BAD_INPUT_STOP: leave the conversion loop.
     377           0 :         break;
     378             : 
     379             :     no_output:
                               // Destination exhausted: hand the current unit back to the input and
                               // stop. A pending high surrogate stays in nHighSurrogate, so when a
                               // context is supplied the next call can complete the pair.
     380           0 :         --pSrcBufPtr;
     381           0 :         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     382           0 :         break;
     383             :     }
     384             : 
                           // The input ended with a high surrogate still pending and nothing else
                           // went wrong. With FLUSH set this is reported as a too-small source
                           // buffer; without FLUSH the lone surrogate is treated as bad input and
                           // the pending state is cleared.
                           // NOTE(review): this is the opposite polarity of the FLUSH test in
                           // ImplConvertUtf8ToUnicode, and without FLUSH it prevents a surrogate
                           // pair from being split across two calls. Verify against the
                           // documented meaning of RTL_UNICODETOTEXT_FLAGS_FLUSH before changing.
     385      876120 :     if (nHighSurrogate != 0
     386           0 :         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
     387             :                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
     388             :                == 0)
     389             :     {
     390           0 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
     391           0 :             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
     392             :         else
     393           0 :             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     394             :                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
     395           0 :                         NULL, 0, NULL))
     396             :             {
     397             :             case sal::detail::textenc::BAD_INPUT_STOP:
     398             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     399           0 :                 nHighSurrogate = 0;
     400           0 :                 break;
     401             : 
     402             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     403           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     404           0 :                 break;
     405             :             }
     406             :     }
     407             : 
     408             :  done:
                           // Store the encoder state back for the next call and report results
                           // through whichever out-parameters the caller supplied.
     409      876120 :     if (pContext != NULL)
     410             :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
     411       19855 :             = nHighSurrogate;
     412      876120 :     if (pInfo != NULL)
     413      876120 :         *pInfo = nInfo;
     414      876120 :     if (pSrcCvtChars != NULL)
     415      876120 :         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
     416      876120 :     return pDestBufPtr - pDestBuf;
     417             : }
     418             : 
     419             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.11