LCOV - code coverage report
Current view: top level - sal/textenc - tcvtutf8.cxx (source / functions) Hit Total Coverage
Test: commit e02a6cb2c3e2b23b203b422e4e0680877f232636 Lines: 76 208 36.5 %
Date: 2014-04-14 Functions: 2 8 25.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : #include "sal/config.h"
      21             : 
      22             : #include "sal/types.h"
      23             : #include "rtl/textcvt.h"
      24             : 
      25             : #include "converter.hxx"
      26             : #include "tcvtutf8.hxx"
      27             : #include "tenchelp.hxx"
      28             : #include "unichars.hxx"
      29             : 
      30             : struct ImplUtf8ToUnicodeContext // Decoder state that ImplConvertUtf8ToUnicode loads on entry and stores back on exit.
      31             : {
      32             :     sal_uInt32 nUtf32; // Code point bits accumulated so far from the sequence being read.
      33             :     int nShift; // Shift for the next continuation byte's 6 payload bits; negative when no sequence is in progress.
      34             :     bool bCheckBom; // True until the first code point (or bad-input event) has been processed; gates the U+FEFF check.
      35             : };
      36             : 
      37             : struct ImplUnicodeToUtf8Context // Encoder state that ImplConvertUnicodeToUtf8 loads on entry and stores back on exit.
      38             : {
      39             :     sal_Unicode nHighSurrogate; /* 0xFFFF: start of text, BOM not yet considered; 0: nothing pending; otherwise: high surrogate waiting for its low surrogate */
      40             : };
      41             : 
      42           0 : void * ImplCreateUtf8ToUnicodeContext() // Allocates a decoder context, resets it, and returns it as an opaque pointer.
      43             : {
      44           0 :     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext; // NOTE(review): nUtf32 is not assigned here nor by the reset call below, yet the converter copies it on entry - confirm this is harmless.
      45           0 :     ImplResetUtf8ToUnicodeContext(p); // Sets nShift and bCheckBom to their start-of-text values.
      46           0 :     return p; // ImplDestroyUtf8ToUnicodeContext deletes a pointer of this type.
      47             : }
      48             : 
      49           0 : void ImplResetUtf8ToUnicodeContext(void * pContext) // Returns a decoder context to its start-of-text state; a null pContext is ignored.
      50             : {
      51           0 :     if (pContext != NULL)
      52             :     {
      53           0 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1; // Negative: not inside a multi-byte sequence.
      54           0 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true; // Re-arm the leading U+FEFF check.
      55             :     }
      56           0 : }
      57             : 
      58           0 : void ImplDestroyUtf8ToUnicodeContext(void * pContext) // Deletes a decoder context; pContext is cast to ImplUtf8ToUnicodeContext* first.
      59             : {
      60           0 :     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext); // delete on a null pointer is a no-op.
      61           0 : }
      62             : 
      63     1147854 : sal_Size ImplConvertUtf8ToUnicode( // Decodes UTF-8 bytes from pSrcBuf into sal_Unicode units in pDestBuf; returns the number of units written.
      64             :     void const * pData, void * pContext, char const * pSrcBuf, // pData: non-null selects Java-UTF-8 mode; pContext: optional ImplUtf8ToUnicodeContext carrying state across calls.
      65             :     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars, // nSrcBytes / nDestChars: sizes of the source and destination buffers.
      66             :     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes) // pInfo / pSrcCvtBytes: optional outputs for RTL_TEXTTOUNICODE_INFO_* bits and the count of source bytes consumed.
      67             : {
      68             :     /*
      69             :        This function is very liberal with the UTF-8 input.  Accepted are:
      70             :        - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
      71             :        - surrogates (e.g., ED A0 80 to represent U+D800)
      72             :        - encodings with up to six bytes (everything outside the range
      73             :          U+0000..10FFFF is considered "undefined")
      74             :        The first two of these points allow this routine to translate from both
      75             :        RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
      76             :       */
      77             : 
      78     1147854 :     bool bJavaUtf8 = pData != NULL;
      79     1147854 :     sal_uInt32 nUtf32 = 0;
      80     1147854 :     int nShift = -1; // Negative: the next byte is expected to start a new sequence.
      81     1147854 :     bool bCheckBom = true;
      82     1147854 :     sal_uInt32 nInfo = 0;
      83     1147854 :     unsigned char const * pSrcBufPtr = (unsigned char const *) pSrcBuf; // Bytes are read as unsigned so the range tests below work for values >= 0x80.
      84     1147854 :     unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
      85     1147854 :     sal_Unicode * pDestBufPtr = pDestBuf;
      86     1147854 :     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
      87             : 
      88     1147854 :     if (pContext != NULL) // Resume from the state a previous call left behind.
      89             :     {
      90           0 :         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
      91           0 :         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
      92           0 :         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
      93             :     }
      94             : 
      95    31185620 :     while (pSrcBufPtr < pSrcBufEnd)
      96             :     {
      97    28889912 :         bool bUndefined = false; // Set only when a decoded value exceeds 0x10FFFF.
      98    28889912 :         bool bConsume = true; // False when the current byte must be re-read after a broken sequence.
      99    28889912 :         sal_uInt32 nChar = *pSrcBufPtr++;
     100    28889912 :         if (nShift < 0) // Lead byte: classify by value and seed nUtf32 / nShift.
     101    28885959 :             if (nChar <= 0x7F) // Single-byte (ASCII) form.
     102             :             {
     103    28883978 :                 nUtf32 = nChar;
     104    28883978 :                 goto transform;
     105             :             }
     106        1981 :             else if (nChar <= 0xBF) // 0x80..0xBF: continuation byte with no lead byte before it.
     107           0 :                 goto bad_input;
     108        1981 :             else if (nChar <= 0xDF) // Two-byte form: one continuation byte follows.
     109             :             {
     110           9 :                 nUtf32 = (nChar & 0x1F) << 6;
     111           9 :                 nShift = 0;
     112             :             }
     113        1972 :             else if (nChar <= 0xEF) // Three-byte form.
     114             :             {
     115        1972 :                 nUtf32 = (nChar & 0x0F) << 12;
     116        1972 :                 nShift = 6;
     117             :             }
     118           0 :             else if (nChar <= 0xF7) // Four-byte form.
     119             :             {
     120           0 :                 nUtf32 = (nChar & 0x07) << 18;
     121           0 :                 nShift = 12;
     122             :             }
     123           0 :             else if (nChar <= 0xFB) // Five-byte form.
     124             :             {
     125           0 :                 nUtf32 = (nChar & 0x03) << 24;
     126           0 :                 nShift = 18;
     127             :             }
     128           0 :             else if (nChar <= 0xFD) // Six-byte form.
     129             :             {
     130           0 :                 nUtf32 = (nChar & 0x01) << 30;
     131           0 :                 nShift = 24;
     132             :             }
     133             :             else // 0xFE / 0xFF never start a sequence.
     134           0 :                 goto bad_input;
     135        3953 :         else if ((nChar & 0xC0) == 0x80) // Continuation byte inside a sequence: merge its 6 payload bits.
     136             :         {
     137        3953 :             nUtf32 |= (nChar & 0x3F) << nShift;
     138        3953 :             if (nShift == 0) // That was the last continuation byte; the code point is complete.
     139        1981 :                 goto transform;
     140             :             else
     141        1972 :                 nShift -= 6;
     142             :         }
     143             :         else
     144             :         {
     145             :             /*
     146             :              This byte is preceded by a broken UTF-8 sequence; if this byte
     147             :              is neither in the range [0x80..0xBF] nor in the range
     148             :              [0xFE..0xFF], assume that this byte does not belong to that
     149             :              broken sequence, but instead starts a new, legal UTF-8 sequence:
     150             :              */
     151           0 :             bConsume = nChar >= 0xFE;
     152           0 :             goto bad_input;
     153             :         }
     154        3953 :         continue; // Sequence still incomplete; fetch the next byte.
     155             : 
     156             :     transform: // A complete code point is in nUtf32; write it unless it is a leading BOM that must be dropped.
     157    28885959 :         if (!bCheckBom || nUtf32 != 0xFEFF // Output is skipped only for a first-position U+FEFF with GLOBAL_SIGNATURE set in non-Java mode.
     158           0 :             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
     159           0 :             || bJavaUtf8)
     160             :         {
     161    28885959 :             if (nUtf32 <= 0xFFFF) // Fits in one sal_Unicode unit.
     162    28885959 :                 if (pDestBufPtr != pDestBufEnd)
     163    28885959 :                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
     164             :                 else
     165           0 :                     goto no_output;
     166           0 :             else if (nUtf32 <= 0x10FFFF) // Needs a surrogate pair, i.e. two free units.
     167           0 :                 if (pDestBufEnd - pDestBufPtr >= 2)
     168             :                 {
     169           0 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
     170           0 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
     171             :                 }
     172             :                 else
     173           0 :                     goto no_output;
     174             :             else // Above U+10FFFF: reported to the handler as undefined.
     175             :             {
     176           0 :                 bUndefined = true;
     177           0 :                 goto bad_input;
     178             :             }
     179             :         }
     180    28885959 :         nShift = -1; // Ready for the next lead byte.
     181    28885959 :         bCheckBom = false; // The BOM check applies to the first code point only.
     182    28885959 :         continue;
     183             : 
     184             :     bad_input: // Delegate to the shared handler; its result selects stop, continue, or destination-full.
     185           0 :         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     186             :                     bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     187           0 :                     &nInfo))
     188             :         {
     189             :         case sal::detail::textenc::BAD_INPUT_STOP:
     190           0 :             nShift = -1;
     191           0 :             bCheckBom = false;
     192           0 :             if (!bConsume)
     193           0 :                 --pSrcBufPtr; // Give back the byte that may start a new sequence.
     194           0 :             break; // Leaves the switch; the break after it then leaves the loop.
     195             : 
     196             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
     197           0 :             nShift = -1;
     198           0 :             bCheckBom = false;
     199           0 :             if (!bConsume)
     200           0 :                 --pSrcBufPtr; // Re-read this byte as a potential lead byte on the next iteration.
     201           0 :             continue;
     202             : 
     203             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     204           0 :             goto no_output;
     205             :         }
     206           0 :         break;
     207             : 
     208             :     no_output: // Destination exhausted: un-read the last byte, flag it, and stop.
     209           0 :         --pSrcBufPtr; // Only one byte is given back; nShift / nUtf32 keep their current values.
     210           0 :         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     211           0 :         break;
     212             :     }
     213             : 
     214     1147854 :     if (nShift >= 0 // Input ended inside a multi-byte sequence and nothing else has been flagged yet.
     215           0 :         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
     216             :                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
     217             :                == 0)
     218             :     {
     219           0 :         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) // Not flushing: report that more source bytes are needed and keep the partial state.
     220           0 :             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
     221             :         else // Flushing: the truncated sequence is handed to the bad-input handler.
     222           0 :             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     223             :                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     224           0 :                         &nInfo))
     225             :             {
     226             :             case sal::detail::textenc::BAD_INPUT_STOP:
     227             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     228           0 :                 nShift = -1;
     229           0 :                 bCheckBom = false;
     230           0 :                 break;
     231             : 
     232             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     233           0 :                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     234           0 :                 break;
     235             :             }
     236             :     }
     237             : 
     238     1147854 :     if (pContext != NULL) // Persist the state for the next call.
     239             :     {
     240           0 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
     241           0 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
     242           0 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
     243             :     }
     244     1147854 :     if (pInfo != NULL)
     245     1147854 :         *pInfo = nInfo;
     246     1147854 :     if (pSrcCvtBytes != NULL)
     247     1147854 :         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf; // Bytes consumed, after any un-reads above.
     248     1147854 :     return pDestBufPtr - pDestBuf; // Number of sal_Unicode units written.
     249             : }
     250             : 
     251           0 : void * ImplCreateUnicodeToUtf8Context() // Allocates an encoder context, resets it, and returns it as an opaque pointer.
     252             : {
     253           0 :     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
     254           0 :     ImplResetUnicodeToUtf8Context(p); // Assigns the only member, nHighSurrogate.
     255           0 :     return p; // ImplDestroyUnicodeToUtf8Context deletes a pointer of this type.
     256             : }
     257             : 
     258           0 : void ImplResetUnicodeToUtf8Context(void * pContext) // Returns an encoder context to its start-of-text state; a null pContext is ignored.
     259             : {
     260           0 :     if (pContext != NULL)
     261           0 :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF; // 0xFFFF is the "write BOM" marker tested by ImplConvertUnicodeToUtf8.
     262           0 : }
     263             : 
     264           0 : void ImplDestroyUnicodeToUtf8Context(void * pContext) // Deletes an encoder context; pContext is cast to ImplUnicodeToUtf8Context* first.
     265             : {
     266           0 :     delete static_cast< ImplUnicodeToUtf8Context * >(pContext); // delete on a null pointer is a no-op.
     267           0 : }
     268             : 
     269     1530378 : sal_Size ImplConvertUnicodeToUtf8( // Encodes sal_Unicode units from pSrcBuf as UTF-8 bytes in pDestBuf; returns the number of bytes written.
     270             :     void const * pData, void * pContext, sal_Unicode const * pSrcBuf, // pData: non-null selects Java-UTF-8 mode; pContext: optional ImplUnicodeToUtf8Context carrying state across calls.
     271             :     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags, // nSrcChars / nDestBytes: sizes of the source and destination buffers.
     272             :     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars) // Optional outputs: RTL_UNICODETOTEXT_INFO_* bits and the count of source units consumed.
     273             : {
     274     1530378 :     bool bJavaUtf8 = pData != NULL;
     275     1530378 :     sal_Unicode nHighSurrogate = 0xFFFF; // Without a context every call starts in the "write BOM" state.
     276     1530378 :     sal_uInt32 nInfo = 0;
     277     1530378 :     sal_Unicode const * pSrcBufPtr = pSrcBuf;
     278     1530378 :     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
     279     1530378 :     char * pDestBufPtr = pDestBuf;
     280     1530378 :     char * pDestBufEnd = pDestBufPtr + nDestBytes;
     281             : 
     282     1530378 :     if (pContext != NULL) // Resume from the state a previous call left behind.
     283             :         nHighSurrogate
     284           0 :             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
     285             : 
     286     1530378 :     if (nHighSurrogate == 0xFFFF) // Start of text: emit the BOM if requested, then switch to the normal state.
     287             :     {
     288     1530378 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
     289           0 :             && !bJavaUtf8) // No BOM is written in Java mode.
     290             :         {
     291           0 :             if (pDestBufEnd - pDestBufPtr >= 3)
     292             :             {
     293             :                 /* Write BOM (U+FEFF) as UTF-8: */
     294           0 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
     295           0 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
     296           0 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
     297             :             }
     298             :             else
     299             :             {
     300           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     301           0 :                 goto done; // nHighSurrogate stays 0xFFFF, so a context-carrying caller attempts the BOM again next call.
     302             :             }
     303             :         }
     304     1530378 :         nHighSurrogate = 0;
     305             :     }
     306             : 
     307    53434650 :     while (pSrcBufPtr < pSrcBufEnd)
     308             :     {
     309    50373894 :         sal_uInt32 nChar = *pSrcBufPtr++;
     310    50373894 :         if (nHighSurrogate == 0) // No surrogate pending.
     311             :         {
     312    50373894 :             if (ImplIsHighSurrogate(nChar) && !bJavaUtf8) // Hold a high surrogate until its partner arrives (non-Java mode only).
     313             :             {
     314           0 :                 nHighSurrogate = (sal_Unicode) nChar;
     315           0 :                 continue;
     316             :             }
     317             :         }
     318           0 :         else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8) // Pending high surrogate plus low surrogate: merge into one code point.
     319           0 :             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
     320             :         else // Pending high surrogate followed by something else.
     321           0 :             goto bad_input;
     322             : 
     323   100747788 :         if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8) // Unpaired low surrogate (non-Java mode) or a value ImplIsNoncharacter rejects.
     324   100747788 :             || ImplIsNoncharacter(nChar))
     325           0 :             goto bad_input;
     326             : 
     327    50373894 :         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0)) // One byte; in Java mode U+0000 falls through to the two-byte branch (C0 80).
     328   100747788 :             if (pDestBufPtr != pDestBufEnd)
     329    50373894 :                 *pDestBufPtr++ = static_cast< char >(nChar);
     330             :             else
     331           0 :                 goto no_output;
     332           0 :         else if (nChar <= 0x7FF) // Two bytes.
     333           0 :             if (pDestBufEnd - pDestBufPtr >= 2)
     334             :             {
     335           0 :                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
     336           0 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     337             :             }
     338             :             else
     339           0 :                 goto no_output;
     340           0 :         else if (nChar <= 0xFFFF) // Three bytes.
     341           0 :             if (pDestBufEnd - pDestBufPtr >= 3)
     342             :             {
     343           0 :                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
     344           0 :                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     345           0 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     346             :             }
     347             :             else
     348           0 :                 goto no_output;
     349           0 :         else if (pDestBufEnd - pDestBufPtr >= 4) // Four bytes (value came from a combined surrogate pair).
     350             :         {
     351           0 :             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
     352           0 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
     353           0 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     354           0 :             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     355             :         }
     356             :         else
     357           0 :             goto no_output;
     358    50373894 :         nHighSurrogate = 0; // Any pending surrogate has now been consumed.
     359    50373894 :         continue;
     360             : 
     361             :     bad_input: // Delegate to the shared handler; its result selects stop, continue, or destination-full.
     362           0 :         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     363             :                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
     364           0 :                     0, NULL))
     365             :         {
     366             :         case sal::detail::textenc::BAD_INPUT_STOP:
     367           0 :             nHighSurrogate = 0;
     368           0 :             break; // Leaves the switch; the break after it then leaves the loop.
     369             : 
     370             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
     371           0 :             nHighSurrogate = 0; // Drop the pending surrogate and carry on with the next unit.
     372           0 :             continue;
     373             : 
     374             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     375           0 :             goto no_output;
     376             :         }
     377           0 :         break;
     378             : 
     379             :     no_output: // Destination exhausted: un-read the current unit, flag it, and stop.
     380           0 :         --pSrcBufPtr;
     381           0 :         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     382           0 :         break;
     383             :     }
     384             : 
     385     1530378 :     if (nHighSurrogate != 0 // Input ended with a high surrogate still pending and nothing else has been flagged yet.
     386           0 :         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
     387             :                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
     388             :                == 0)
     389             :     {
     390           0 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) // NOTE(review): opposite sense to the FLUSH test in ImplConvertUtf8ToUnicode (== 0 there) - confirm which is intended before relying on it.
     391           0 :             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
     392             :         else
     393           0 :             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     394             :                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
     395           0 :                         NULL, 0, NULL))
     396             :             {
     397             :             case sal::detail::textenc::BAD_INPUT_STOP:
     398             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     399           0 :                 nHighSurrogate = 0;
     400           0 :                 break;
     401             : 
     402             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     403           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     404           0 :                 break;
     405             :             }
     406             :     }
     407             : 
     408             :  done: // Common exit: persist state and publish the outputs.
     409     1530378 :     if (pContext != NULL)
     410             :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
     411           0 :             = nHighSurrogate;
     412     1530378 :     if (pInfo != NULL)
     413     1530378 :         *pInfo = nInfo;
     414     1530378 :     if (pSrcCvtChars != NULL)
     415     1530378 :         *pSrcCvtChars = pSrcBufPtr - pSrcBuf; // Source units consumed, after any un-read above.
     416     1530378 :     return pDestBufPtr - pDestBuf; // Number of bytes written.
     417             : }
     418             : 
     419             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.10