LCOV - code coverage report
Current view: top level - libreoffice/sal/textenc - tcvtutf8.cxx (source / functions) Hit Total Coverage
Test: libreoffice_filtered.info Lines: 161 203 79.3 %
Date: 2012-12-27 Functions: 8 8 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : #include "sal/config.h"
      21             : 
      22             : #include "sal/types.h"
      23             : #include "rtl/textcvt.h"
      24             : 
      25             : #include "converter.hxx"
      26             : #include "tcvtutf8.hxx"
      27             : #include "tenchelp.hxx"
      28             : #include "unichars.hxx"
      29             : 
/* State of the UTF-8 decoder (ImplConvertUtf8ToUnicode) that is loaded at the
   start of a call and stored back at its end, so that a multi-byte sequence
   may be split across two input buffers. */
struct ImplUtf8ToUnicodeContext
{
    sal_uInt32 nUtf32; /* bits of the code point accumulated so far */
    int nShift; /* -1: no sequence in progress; otherwise the left shift that
                   is applied to the payload of the next continuation byte */
    bool bCheckBom; /* true until the first character or bad-input event has
                       been handled; U+FEFF is only dropped while it is set */
};
      36             : 
/* State of the UTF-8 encoder (ImplConvertUnicodeToUtf8) that is loaded at the
   start of a call and stored back at its end. */
struct ImplUnicodeToUtf8Context
{
    /* 0xFFFF: nothing has been converted yet, write BOM (if requested);
       0: no high surrogate is pending;
       any other value: a high surrogate that still waits for its low
       surrogate */
    sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
};
      41             : 
      42          26 : void * ImplCreateUtf8ToUnicodeContext()
      43             : {
      44          26 :     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
      45          26 :     ImplResetUtf8ToUnicodeContext(p);
      46          26 :     return p;
      47             : }
      48             : 
      49          26 : void ImplResetUtf8ToUnicodeContext(void * pContext)
      50             : {
      51          26 :     if (pContext != NULL)
      52             :     {
      53          26 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
      54          26 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
      55             :     }
      56          26 : }
      57             : 
      58          26 : void ImplDestroyUtf8ToUnicodeContext(void * pContext)
      59             : {
      60          26 :     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
      61          26 : }
      62             : 
      63     1110678 : sal_Size ImplConvertUtf8ToUnicode(
      64             :     void const * pData, void * pContext, char const * pSrcBuf,
      65             :     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
      66             :     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
      67             : {
      68             :     /*
      69             :        This function is very liberal with the UTF-8 input.  Accepted are:
      70             :        - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
      71             :        - surrogates (e.g., ED A0 80 to represent U+D800)
      72             :        - encodings with up to six bytes (everything outside the range
      73             :          U+0000..10FFFF is considered "undefined")
      74             :        The first two of these points allow this routine to translate from both
      75             :        RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
      76             :       */
      77             : 
      78     1110678 :     int bJavaUtf8 = pData != NULL;
      79     1110678 :     sal_uInt32 nUtf32 = 0;
      80     1110678 :     int nShift = -1;
      81     1110678 :     bool bCheckBom = true;
      82     1110678 :     sal_uInt32 nInfo = 0;
      83     1110678 :     sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf;
      84     1110678 :     sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
      85     1110678 :     sal_Unicode * pDestBufPtr = pDestBuf;
      86     1110678 :     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
      87             : 
      88     1110678 :     if (pContext != NULL)
      89             :     {
      90         101 :         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
      91         101 :         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
      92         101 :         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
      93             :     }
      94             : 
      95    23940574 :     while (pSrcBufPtr < pSrcBufEnd)
      96             :     {
      97    21719274 :         bool bUndefined = false;
      98    21719274 :         int bConsume = true;
      99    21719274 :         sal_uInt32 nChar = *pSrcBufPtr++;
     100    21719274 :         if (nShift < 0)
     101    17646677 :             if (nChar <= 0x7F)
     102             :             {
     103    15593777 :                 nUtf32 = nChar;
     104    15593777 :                 goto transform;
     105             :             }
     106     2052900 :             else if (nChar <= 0xBF)
     107          95 :                 goto bad_input;
     108     2052805 :             else if (nChar <= 0xDF)
     109             :             {
     110       33036 :                 nUtf32 = (nChar & 0x1F) << 6;
     111       33036 :                 nShift = 0;
     112             :             }
     113     2019769 :             else if (nChar <= 0xEF)
     114             :             {
     115     2019705 :                 nUtf32 = (nChar & 0x0F) << 12;
     116     2019705 :                 nShift = 6;
     117             :             }
     118          64 :             else if (nChar <= 0xF7)
     119             :             {
     120          27 :                 nUtf32 = (nChar & 0x07) << 18;
     121          27 :                 nShift = 12;
     122             :             }
     123          37 :             else if (nChar <= 0xFB)
     124             :             {
     125          15 :                 nUtf32 = (nChar & 0x03) << 24;
     126          15 :                 nShift = 18;
     127             :             }
     128          22 :             else if (nChar <= 0xFD)
     129             :             {
     130          16 :                 nUtf32 = (nChar & 0x01) << 30;
     131          16 :                 nShift = 24;
     132             :             }
     133             :             else
     134           6 :                 goto bad_input;
     135     4072597 :         else if ((nChar & 0xC0) == 0x80)
     136             :         {
     137     4071385 :             nUtf32 |= (nChar & 0x3F) << nShift;
     138     4071385 :             if (nShift == 0)
     139     2051584 :                 goto transform;
     140             :             else
     141     2019801 :                 nShift -= 6;
     142             :         }
     143             :         else
     144             :         {
     145             :             /*
     146             :              This byte is preceeded by a broken UTF-8 sequence; if this byte
     147             :              is neither in the range [0x80..0xBF] nor in the range
     148             :              [0xFE..0xFF], assume that this byte does not belong to that
     149             :              broken sequence, but instead starts a new, legal UTF-8 sequence:
     150             :              */
     151        1212 :             bConsume = nChar >= 0xFE;
     152        1212 :             goto bad_input;
     153             :         }
     154     4072600 :         continue;
     155             : 
     156             :     transform:
     157    17645361 :         if (!bCheckBom || nUtf32 != 0xFEFF
     158             :             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
     159             :             || bJavaUtf8)
     160             :         {
     161    17645357 :             if (nUtf32 <= 0xFFFF)
     162    17645344 :                 if (pDestBufPtr != pDestBufEnd)
     163    17645344 :                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
     164             :                 else
     165           0 :                     goto no_output;
     166          13 :             else if (nUtf32 <= 0x10FFFF)
     167           5 :                 if (pDestBufEnd - pDestBufPtr >= 2)
     168             :                 {
     169           5 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
     170           5 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
     171             :                 }
     172             :                 else
     173           0 :                     goto no_output;
     174             :             else
     175             :             {
     176           8 :                 bUndefined = true;
     177           8 :                 goto bad_input;
     178             :             }
     179             :         }
     180    17645353 :         nShift = -1;
     181    17645353 :         bCheckBom = false;
     182    17645353 :         continue;
     183             : 
     184             :     bad_input:
     185        1321 :         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     186             :                     bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     187        1321 :                     &nInfo))
     188             :         {
     189             :         case sal::detail::textenc::BAD_INPUT_STOP:
     190           0 :             nShift = -1;
     191           0 :             bCheckBom = false;
     192           0 :             if (!bConsume)
     193           0 :                 --pSrcBufPtr;
     194           0 :             break;
     195             : 
     196             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
     197        1265 :             nShift = -1;
     198        1265 :             bCheckBom = false;
     199        1265 :             if (!bConsume)
     200        1156 :                 --pSrcBufPtr;
     201        1265 :             continue;
     202             : 
     203             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     204          56 :             goto no_output;
     205             :         }
     206           0 :         break;
     207             : 
     208             :     no_output:
     209          56 :         --pSrcBufPtr;
     210          56 :         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     211          56 :         break;
     212             :     }
     213             : 
     214     1110678 :     if (nShift >= 0
     215             :         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
     216             :                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
     217             :                == 0)
     218             :     {
     219          47 :         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
     220          47 :             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
     221             :         else
     222           0 :             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     223             :                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     224           0 :                         &nInfo))
     225             :             {
     226             :             case sal::detail::textenc::BAD_INPUT_STOP:
     227             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     228           0 :                 nShift = -1;
     229           0 :                 bCheckBom = false;
     230           0 :                 break;
     231             : 
     232             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     233           0 :                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     234           0 :                 break;
     235             :             }
     236             :     }
     237             : 
     238     1110678 :     if (pContext != NULL)
     239             :     {
     240         101 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
     241         101 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
     242         101 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
     243             :     }
     244     1110678 :     if (pInfo != NULL)
     245     1110678 :         *pInfo = nInfo;
     246     1110678 :     if (pSrcCvtBytes != NULL)
     247     1110678 :         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
     248     1110678 :     return pDestBufPtr - pDestBuf;
     249             : }
     250             : 
     251           8 : void * ImplCreateUnicodeToUtf8Context()
     252             : {
     253           8 :     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
     254           8 :     ImplResetUnicodeToUtf8Context(p);
     255           8 :     return p;
     256             : }
     257             : 
     258           8 : void ImplResetUnicodeToUtf8Context(void * pContext)
     259             : {
     260           8 :     if (pContext != NULL)
     261           8 :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
     262           8 : }
     263             : 
     264           8 : void ImplDestroyUnicodeToUtf8Context(void * pContext)
     265             : {
     266           8 :     delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
     267           8 : }
     268             : 
     269      467124 : sal_Size ImplConvertUnicodeToUtf8(
     270             :     void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
     271             :     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
     272             :     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
     273             : {
     274      467124 :     int bJavaUtf8 = pData != NULL;
     275      467124 :     sal_Unicode nHighSurrogate = 0xFFFF;
     276      467124 :     sal_uInt32 nInfo = 0;
     277      467124 :     sal_Unicode const * pSrcBufPtr = pSrcBuf;
     278      467124 :     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
     279      467124 :     char * pDestBufPtr = pDestBuf;
     280      467124 :     char * pDestBufEnd = pDestBufPtr + nDestBytes;
     281             : 
     282      467124 :     if (pContext != NULL)
     283             :         nHighSurrogate
     284           8 :             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
     285             : 
     286      467124 :     if (nHighSurrogate == 0xFFFF)
     287             :     {
     288      467124 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
     289             :             && !bJavaUtf8)
     290             :         {
     291           2 :             if (pDestBufEnd - pDestBufPtr >= 3)
     292             :             {
     293             :                 /* Write BOM (U+FEFF) as UTF-8: */
     294           2 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
     295           2 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
     296           2 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
     297             :             }
     298             :             else
     299             :             {
     300           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     301           0 :                 goto done;
     302             :             }
     303             :         }
     304      467124 :         nHighSurrogate = 0;
     305             :     }
     306             : 
     307    27679571 :     while (pSrcBufPtr < pSrcBufEnd)
     308             :     {
     309    26745323 :         sal_uInt32 nChar = *pSrcBufPtr++;
     310    26745323 :         if (nHighSurrogate == 0)
     311             :         {
     312    26745322 :             if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
     313             :             {
     314           1 :                 nHighSurrogate = (sal_Unicode) nChar;
     315           1 :                 continue;
     316             :             }
     317             :         }
     318           1 :         else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
     319           1 :             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
     320             :         else
     321           0 :             goto bad_input;
     322             : 
     323    53490644 :         if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
     324    26745322 :             || ImplIsNoncharacter(nChar))
     325           0 :             goto bad_input;
     326             : 
     327    26745322 :         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
     328    53483624 :             if (pDestBufPtr != pDestBufEnd)
     329    26741812 :                 *pDestBufPtr++ = static_cast< char >(nChar);
     330             :             else
     331           0 :                 goto no_output;
     332        3510 :         else if (nChar <= 0x7FF)
     333         957 :             if (pDestBufEnd - pDestBufPtr >= 2)
     334             :             {
     335         957 :                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
     336         957 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     337             :             }
     338             :             else
     339           0 :                 goto no_output;
     340        2553 :         else if (nChar <= 0xFFFF)
     341        2552 :             if (pDestBufEnd - pDestBufPtr >= 3)
     342             :             {
     343        2552 :                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
     344        2552 :                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     345        2552 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     346             :             }
     347             :             else
     348           0 :                 goto no_output;
     349           1 :         else if (pDestBufEnd - pDestBufPtr >= 4)
     350             :         {
     351           1 :             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
     352           1 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
     353           1 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     354           1 :             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     355             :         }
     356             :         else
     357           0 :             goto no_output;
     358    26745322 :         nHighSurrogate = 0;
     359    26745322 :         continue;
     360             : 
     361             :     bad_input:
     362           0 :         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     363             :                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
     364           0 :                     0, NULL))
     365             :         {
     366             :         case sal::detail::textenc::BAD_INPUT_STOP:
     367           0 :             nHighSurrogate = 0;
     368           0 :             break;
     369             : 
     370             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
     371           0 :             nHighSurrogate = 0;
     372           0 :             continue;
     373             : 
     374             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     375           0 :             goto no_output;
     376             :         }
     377           0 :         break;
     378             : 
     379             :     no_output:
     380           0 :         --pSrcBufPtr;
     381           0 :         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     382           0 :         break;
     383             :     }
     384             : 
     385      467124 :     if (nHighSurrogate != 0
     386             :         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
     387             :                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
     388             :                == 0)
     389             :     {
     390           0 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
     391           0 :             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
     392             :         else
     393           0 :             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     394             :                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
     395           0 :                         NULL, 0, NULL))
     396             :             {
     397             :             case sal::detail::textenc::BAD_INPUT_STOP:
     398             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     399           0 :                 nHighSurrogate = 0;
     400           0 :                 break;
     401             : 
     402             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     403           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     404           0 :                 break;
     405             :             }
     406             :     }
     407             : 
     408             :  done:
     409      467124 :     if (pContext != NULL)
     410             :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
     411           8 :             = nHighSurrogate;
     412      467124 :     if (pInfo != NULL)
     413      467124 :         *pInfo = nInfo;
     414      467124 :     if (pSrcCvtChars != NULL)
     415      467124 :         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
     416      467124 :     return pDestBufPtr - pDestBuf;
     417             : }
     418             : 
     419             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.10