LCOV - code coverage report
Current view: top level - usr/local/src/libreoffice/sal/textenc - tcvtutf8.cxx (source / functions) Hit Total Coverage
Test: libreoffice_filtered.info Lines: 165 208 79.3 %
Date: 2013-07-09 Functions: 8 8 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : #include "sal/config.h"
      21             : 
      22             : #include "sal/types.h"
      23             : #include "rtl/textcvt.h"
      24             : 
      25             : #include "converter.hxx"
      26             : #include "tcvtutf8.hxx"
      27             : #include "tenchelp.hxx"
      28             : #include "unichars.hxx"
      29             : 
      30             : struct ImplUtf8ToUnicodeContext
      31             : {
                           /* Decoder state carried between calls of ImplConvertUtf8ToUnicode. */

                           /* Code point assembled so far from the bytes of the current sequence. */
      32             :     sal_uInt32 nUtf32;
                           /* Bit position at which the next continuation byte's six payload
                              bits are merged into nUtf32; -1 while no sequence is in progress. */
      33             :     int nShift;
                           /* True until the first character (or first bad input) has been
                              handled; only while it is true may a leading U+FEFF be dropped. */
      34             :     bool bCheckBom;
      35             : };
      36             : 
      37             : struct ImplUnicodeToUtf8Context
      38             : {
                           /* Encoder state carried between calls of ImplConvertUnicodeToUtf8.
                              The single field has three meanings:
                                0xFFFF  start of stream, the BOM decision has not been made yet;
                                0       no high surrogate is pending;
                                other   a high surrogate waiting for its low surrogate. */
      39             :     sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
      40             : };
      41             : 
      42         157 : void * ImplCreateUtf8ToUnicodeContext()
      43             : {
                           /* Allocates a UTF-8 decoder context, puts it into its initial state
                              and returns it as an untyped pointer. The matching release is
                              ImplDestroyUtf8ToUnicodeContext.
                              NOTE(review): the object is created without an initializer and the
                              reset below assigns only nShift and bCheckBom, so nUtf32 starts
                              out unset. */
      44         157 :     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
      45         157 :     ImplResetUtf8ToUnicodeContext(p);
      46         157 :     return p;
      47             : }
      48             : 
      49         157 : void ImplResetUtf8ToUnicodeContext(void * pContext)
      50             : {
                           /* Returns a decoder context to its initial state: no sequence in
                              progress (nShift = -1) and BOM detection armed. A null pContext
                              is accepted and ignored.
                              NOTE(review): nUtf32 is not touched here. The converter copies it
                              on entry but always overwrites it before use while nShift is -1;
                              consider zeroing it so the copy never reads an unset value. */
      51         157 :     if (pContext != NULL)
      52             :     {
      53         157 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
      54         157 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
      55             :     }
      56         157 : }
      57             : 
      58         157 : void ImplDestroyUtf8ToUnicodeContext(void * pContext)
      59             : {
                           /* Releases a context obtained from ImplCreateUtf8ToUnicodeContext.
                              The cast restores the concrete type before delete; deleting a
                              null pointer is a no-op. */
      60         157 :     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
      61         157 : }
      62             : 
      63     1216645 : sal_Size ImplConvertUtf8ToUnicode(
      64             :     void const * pData, void * pContext, char const * pSrcBuf,
      65             :     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
      66             :     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
      67             : {
                           /*
                              Decodes UTF-8 bytes from pSrcBuf into UTF-16 code units in pDestBuf.

                              pData         non-null selects the Java UTF-8 variant; the only
                                            effect visible in this function is that a leading
                                            U+FEFF is then never dropped
                              pContext      optional ImplUtf8ToUnicodeContext; when non-null the
                                            state of an unfinished sequence is loaded on entry
                                            and stored back on exit
                              pSrcBuf       input bytes, nSrcBytes of them
                              pDestBuf      output buffer with room for nDestChars code units
                              nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; GLOBAL_SIGNATURE and
                                            FLUSH are tested here, the whole value is also passed
                                            on to the bad-input handler
                              pInfo         if non-null, receives the accumulated
                                            RTL_TEXTTOUNICODE_INFO_* bits
                              pSrcCvtBytes  if non-null, receives the number of source bytes
                                            consumed

                              Returns the number of code units written to pDestBuf.
                            */
      68             :     /*
      69             :        This function is very liberal with the UTF-8 input.  Accepted are:
      70             :        - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
      71             :        - surrogates (e.g., ED A0 80 to represent U+D800)
      72             :        - encodings with up to six bytes (everything outside the range
      73             :          U+0000..10FFFF is considered "undefined")
      74             :        The first two of these points allow this routine to translate from both
      75             :        RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
      76             :       */
      77             : 
      78     1216645 :     int bJavaUtf8 = pData != NULL;
                           /* Defaults below describe a fresh stream; they are used as-is when
                              the caller supplies no context. */
      79     1216645 :     sal_uInt32 nUtf32 = 0;
      80     1216645 :     int nShift = -1;
      81     1216645 :     bool bCheckBom = true;
      82     1216645 :     sal_uInt32 nInfo = 0;
                           /* Bytes are read as unsigned so that values >= 0x80 compare as
                              0x80..0xFF in the range tests below. */
      83     1216645 :     sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf;
      84     1216645 :     sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
      85     1216645 :     sal_Unicode * pDestBufPtr = pDestBuf;
      86     1216645 :     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
      87             : 
                           /* Resume whatever a previous call left unfinished. */
      88     1216645 :     if (pContext != NULL)
      89             :     {
      90         629 :         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
      91         629 :         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
      92         629 :         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
      93             :     }
      94             : 
      95    31431436 :     while (pSrcBufPtr < pSrcBufEnd)
      96             :     {
                               /* bUndefined is handed to the bad-input handler; it is set only
                                  for code points above U+10FFFF. bConsume records whether the
                                  current byte stays consumed when it turns out to be bad. */
      97    28998202 :         bool bUndefined = false;
      98    28998202 :         int bConsume = true;
      99    28998202 :         sal_uInt32 nChar = *pSrcBufPtr++;
                               /* nShift < 0: no sequence in progress, so nChar must be a lead
                                  byte. The lead byte's payload bits are placed at the top of the
                                  code point and nShift is set to the position for the first
                                  continuation byte. */
     100    28998202 :         if (nShift < 0)
     101    24451269 :             if (nChar <= 0x7F)
     102             :             {
                                       /* Single-byte (ASCII) character: complete immediately. */
     103    22167472 :                 nUtf32 = nChar;
     104    22167472 :                 goto transform;
     105             :             }
     106     2283797 :             else if (nChar <= 0xBF)
                                       /* 0x80..0xBF: continuation byte without a lead byte. */
     107          95 :                 goto bad_input;
     108     2283702 :             else if (nChar <= 0xDF)
     109             :             {
                                       /* Two-byte sequence. */
     110       20497 :                 nUtf32 = (nChar & 0x1F) << 6;
     111       20497 :                 nShift = 0;
     112             :             }
     113     2263205 :             else if (nChar <= 0xEF)
     114             :             {
                                       /* Three-byte sequence. */
     115     2263144 :                 nUtf32 = (nChar & 0x0F) << 12;
     116     2263144 :                 nShift = 6;
     117             :             }
     118          61 :             else if (nChar <= 0xF7)
     119             :             {
                                       /* Four-byte sequence. */
     120          27 :                 nUtf32 = (nChar & 0x07) << 18;
     121          27 :                 nShift = 12;
     122             :             }
     123          34 :             else if (nChar <= 0xFB)
     124             :             {
                                       /* Five-byte sequence (accepted, see the note at the top). */
     125          15 :                 nUtf32 = (nChar & 0x03) << 24;
     126          15 :                 nShift = 18;
     127             :             }
     128          19 :             else if (nChar <= 0xFD)
     129             :             {
                                       /* Six-byte sequence (accepted, see the note at the top). */
     130          13 :                 nUtf32 = (nChar & 0x01) << 30;
     131          13 :                 nShift = 24;
     132             :             }
     133             :             else
                                       /* 0xFE and 0xFF are not lead bytes of any length. */
     134           6 :                 goto bad_input;
     135     4546933 :         else if ((nChar & 0xC0) == 0x80)
     136             :         {
                                   /* Continuation byte of the sequence in progress: merge its six
                                      payload bits. nShift == 0 means this was the last byte. */
     137     4545724 :             nUtf32 |= (nChar & 0x3F) << nShift;
     138     4545724 :             if (nShift == 0)
     139     2282484 :                 goto transform;
     140             :             else
     141     2263240 :                 nShift -= 6;
     142             :         }
     143             :         else
     144             :         {
     145             :             /*
     146             :              This byte is preceded by a broken UTF-8 sequence; if this byte
     147             :              is neither in the range [0x80..0xBF] nor in the range
     148             :              [0xFE..0xFF], assume that this byte does not belong to that
     149             :              broken sequence, but instead starts a new, legal UTF-8 sequence:
     150             :              */
     151        1209 :             bConsume = nChar >= 0xFE;
     152        1209 :             goto bad_input;
     153             :         }
                               /* Sequence started or extended but not yet complete. */
     154     4546936 :         continue;
     155             : 
     156             :     transform:
                               /* nUtf32 holds a complete code point. It is written unless all of
                                  the following hold: it is the first character (bCheckBom), it is
                                  U+FEFF, GLOBAL_SIGNATURE is requested and this is not Java
                                  UTF-8. In that one case the BOM is dropped silently. */
     157    24449956 :         if (!bCheckBom || nUtf32 != 0xFEFF
     158          10 :             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
     159           6 :             || bJavaUtf8)
     160             :         {
                                   /* Up to U+FFFF: one code unit. Up to U+10FFFF: a surrogate
                                      pair, written only if both units fit. Anything larger is
                                      reported as undefined input. */
     161    24449952 :             if (nUtf32 <= 0xFFFF)
     162    24449939 :                 if (pDestBufPtr != pDestBufEnd)
     163    24449939 :                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
     164             :                 else
     165           0 :                     goto no_output;
     166          13 :             else if (nUtf32 <= 0x10FFFF)
     167           5 :                 if (pDestBufEnd - pDestBufPtr >= 2)
     168             :                 {
     169           5 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
     170           5 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
     171             :                 }
     172             :                 else
     173           0 :                     goto no_output;
     174             :             else
     175             :             {
     176           8 :                 bUndefined = true;
     177           8 :                 goto bad_input;
     178             :             }
     179             :         }
                               /* Character done: back to "expect a lead byte"; BOM detection
                                  applies to the first character only. */
     180    24449948 :         nShift = -1;
     181    24449948 :         bCheckBom = false;
     182    24449948 :         continue;
     183             : 
     184             :     bad_input:
                               /* The handler is defined elsewhere; it receives the flags, the
                                  output cursor and nInfo, and its result selects one of the
                                  three outcomes below. */
     185        1318 :         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     186             :                     bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     187        1318 :                     &nInfo))
     188             :         {
     189             :         case sal::detail::textenc::BAD_INPUT_STOP:
                                   /* Discard the broken sequence, push the current byte back if
                                      it was not part of it, then leave the loop (via the break
                                      that follows the switch). */
     190           0 :             nShift = -1;
     191           0 :             bCheckBom = false;
     192           0 :             if (!bConsume)
     193           0 :                 --pSrcBufPtr;
     194           0 :             break;
     195             : 
     196             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
                                   /* Same clean-up as above, but carry on with the next byte. */
     197        1262 :             nShift = -1;
     198        1262 :             bCheckBom = false;
     199        1262 :             if (!bConsume)
     200        1153 :                 --pSrcBufPtr;
     201        1262 :             continue;
     202             : 
     203             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     204          56 :             goto no_output;
     205             :         }
     206           0 :         break;
     207             : 
     208             :     no_output:
                               /* Destination full: un-read the current byte and stop. nShift and
                                  nUtf32 are left untouched, so if the byte is read again later
                                  the |= merge above yields the same code point. */
     209          56 :         --pSrcBufPtr;
     210          56 :         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     211          56 :         break;
     212             :     }
     213             : 
                           /* Input ended inside a sequence and nothing else went wrong:
                              without FLUSH, report that more source bytes are needed;
                              with FLUSH, treat the truncated sequence as bad input. */
     214     1216645 :     if (nShift >= 0
     215         103 :         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
     216             :                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
     217             :                == 0)
     218             :     {
     219          47 :         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
     220          47 :             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
     221             :         else
     222           0 :             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     223             :                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     224           0 :                         &nInfo))
     225             :             {
     226             :             case sal::detail::textenc::BAD_INPUT_STOP:
     227             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     228           0 :                 nShift = -1;
     229           0 :                 bCheckBom = false;
     230           0 :                 break;
     231             : 
     232             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     233           0 :                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     234           0 :                 break;
     235             :             }
     236             :     }
     237             : 
                           /* Save the (possibly mid-sequence) state for the next call and
                              report results through the optional out-parameters. */
     238     1216645 :     if (pContext != NULL)
     239             :     {
     240         629 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
     241         629 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
     242         629 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
     243             :     }
     244     1216645 :     if (pInfo != NULL)
     245     1216645 :         *pInfo = nInfo;
     246     1216645 :     if (pSrcCvtBytes != NULL)
     247     1216645 :         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
     248     1216645 :     return pDestBufPtr - pDestBuf;
     249             : }
     250             : 
     251        1925 : void * ImplCreateUnicodeToUtf8Context()
     252             : {
                           /* Allocates a UTF-8 encoder context, puts it into its initial state
                              (nHighSurrogate = 0xFFFF) and returns it as an untyped pointer.
                              The matching release is ImplDestroyUnicodeToUtf8Context. */
     253        1925 :     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
     254        1925 :     ImplResetUnicodeToUtf8Context(p);
     255        1925 :     return p;
     256             : }
     257             : 
     258        1925 : void ImplResetUnicodeToUtf8Context(void * pContext)
     259             : {
                           /* Returns an encoder context to its start-of-stream state. 0xFFFF is
                              the marker the converter tests to decide whether a BOM may still
                              be written. A null pContext is accepted and ignored. */
     260        1925 :     if (pContext != NULL)
     261        1925 :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
     262        1925 : }
     263             : 
     264        1925 : void ImplDestroyUnicodeToUtf8Context(void * pContext)
     265             : {
                           /* Releases a context obtained from ImplCreateUnicodeToUtf8Context.
                              The cast restores the concrete type before delete; deleting a
                              null pointer is a no-op. */
     266        1925 :     delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
     267        1925 : }
     268             : 
     269      748011 : sal_Size ImplConvertUnicodeToUtf8(
     270             :     void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
     271             :     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
     272             :     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
     273             : {
                           /*
                              Encodes UTF-16 code units from pSrcBuf as UTF-8 bytes in pDestBuf.

                              pData         non-null selects the Java UTF-8 variant: no BOM is
                                            written, surrogates are not combined (each is encoded
                                            on its own as three bytes) and U+0000 is written as
                                            the two bytes C0 80
                              pContext      optional ImplUnicodeToUtf8Context; when non-null the
                                            pending-surrogate / start-of-stream state is loaded
                                            on entry and stored back on exit
                              pSrcBuf       input code units, nSrcChars of them
                              pDestBuf      output buffer with room for nDestBytes bytes
                              nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; GLOBAL_SIGNATURE and
                                            FLUSH are tested here, the whole value is also passed
                                            on to the bad-input handler
                              pInfo         if non-null, receives the accumulated
                                            RTL_UNICODETOTEXT_INFO_* bits
                              pSrcCvtChars  if non-null, receives the number of source code
                                            units consumed

                              Returns the number of bytes written to pDestBuf.
                            */
     274      748011 :     int bJavaUtf8 = pData != NULL;
                           /* 0xFFFF = start of stream. Without a context every call starts in
                              this state, so a requested BOM is written on each such call. */
     275      748011 :     sal_Unicode nHighSurrogate = 0xFFFF;
     276      748011 :     sal_uInt32 nInfo = 0;
     277      748011 :     sal_Unicode const * pSrcBufPtr = pSrcBuf;
     278      748011 :     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
     279      748011 :     char * pDestBufPtr = pDestBuf;
     280      748011 :     char * pDestBufEnd = pDestBufPtr + nDestBytes;
     281             : 
     282      748011 :     if (pContext != NULL)
     283             :         nHighSurrogate
     284       14790 :             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
     285             : 
     286      748011 :     if (nHighSurrogate == 0xFFFF)
     287             :     {
     288      735146 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
     289           3 :             && !bJavaUtf8)
     290             :         {
     291           2 :             if (pDestBufEnd - pDestBufPtr >= 3)
     292             :             {
     293             :                 /* Write BOM (U+FEFF) as UTF-8: */
     294           2 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
     295           2 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
     296           2 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
     297             :             }
     298             :             else
     299             :             {
                                       /* No room for the BOM: jump out with nHighSurrogate still
                                          0xFFFF, so a later call using the same context tries
                                          the BOM again. */
     300           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     301           0 :                 goto done;
     302             :             }
     303             :         }
                               /* Start-of-stream handling is finished; from here on the variable
                                  only tracks a pending high surrogate. */
     304      735146 :         nHighSurrogate = 0;
     305             :     }
     306             : 
     307    41898649 :     while (pSrcBufPtr < pSrcBufEnd)
     308             :     {
     309    40402627 :         sal_uInt32 nChar = *pSrcBufPtr++;
     310    40402627 :         if (nHighSurrogate == 0)
     311             :         {
                                   /* A high surrogate is remembered and combined with the next
                                      unit, except in Java mode where it is encoded as-is. */
     312    40402626 :             if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
     313             :             {
     314           1 :                 nHighSurrogate = (sal_Unicode) nChar;
     315           1 :                 continue;
     316             :             }
     317             :         }
     318           1 :         else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
     319           1 :             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
     320             :         else
                                   /* A pending high surrogate was not followed by a low one. */
     321           0 :             goto bad_input;
     322             : 
                               /* Reject a low surrogate that has no preceding high surrogate
                                  (non-Java mode only) and anything ImplIsNoncharacter flags. */
     323    80805253 :         if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
     324    80805252 :             || ImplIsNoncharacter(nChar))
     325           0 :             goto bad_input;
     326             : 
                               /* Emit 1 to 4 bytes depending on the magnitude of nChar. Each
                                  branch first checks that the whole sequence fits, so a
                                  character is never written partially. In Java mode U+0000
                                  skips the one-byte branch and is encoded by the two-byte one. */
     327    40402626 :         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
     328    80772526 :             if (pDestBufPtr != pDestBufEnd)
     329    40386263 :                 *pDestBufPtr++ = static_cast< char >(nChar);
     330             :             else
     331           0 :                 goto no_output;
     332       16363 :         else if (nChar <= 0x7FF)
     333        1671 :             if (pDestBufEnd - pDestBufPtr >= 2)
     334             :             {
     335        1671 :                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
     336        1671 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     337             :             }
     338             :             else
     339           0 :                 goto no_output;
     340       14692 :         else if (nChar <= 0xFFFF)
     341       14691 :             if (pDestBufEnd - pDestBufPtr >= 3)
     342             :             {
     343       14691 :                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
     344       14691 :                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     345       14691 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     346             :             }
     347             :             else
     348           0 :                 goto no_output;
     349           1 :         else if (pDestBufEnd - pDestBufPtr >= 4)
     350             :         {
     351           1 :             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
     352           1 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
     353           1 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     354           1 :             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     355             :         }
     356             :         else
     357           0 :             goto no_output;
                               /* Character written: no surrogate is pending any more. */
     358    40402626 :         nHighSurrogate = 0;
     359    40402626 :         continue;
     360             : 
     361             :     bad_input:
                               /* The handler is defined elsewhere; it receives the flags, the
                                  output cursor and nInfo, and its result selects one of the
                                  three outcomes below. */
     362           0 :         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     363             :                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
     364           0 :                     0, NULL))
     365             :         {
     366             :         case sal::detail::textenc::BAD_INPUT_STOP:
                                   /* Drop the pending surrogate and leave the loop (via the
                                      break that follows the switch). */
     367           0 :             nHighSurrogate = 0;
     368           0 :             break;
     369             : 
     370             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
                                   /* Drop the pending surrogate and go on with the next unit. */
     371           0 :             nHighSurrogate = 0;
     372           0 :             continue;
     373             : 
     374             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     375           0 :             goto no_output;
     376             :         }
     377           0 :         break;
     378             : 
     379             :     no_output:
                               /* Destination full: un-read the current code unit and stop. A
                                  pending high surrogate, if any, stays in nHighSurrogate. */
     380           0 :         --pSrcBufPtr;
     381           0 :         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     382           0 :         break;
     383             :     }
     384             : 
                           /* Input ended with a high surrogate still pending and nothing else
                              went wrong.
                              NOTE(review): here FLUSH *set* reports SRCBUFFERTOSMALL and FLUSH
                              *clear* invokes the bad-input handler, which is the opposite
                              polarity of the corresponding check in ImplConvertUtf8ToUnicode
                              (where FLUSH clear reports SRCBUFFERTOSMALL). Confirm which of
                              the two is intended. */
     385      748011 :     if (nHighSurrogate != 0
     386           0 :         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
     387             :                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
     388             :                == 0)
     389             :     {
     390           0 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
     391           0 :             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
     392             :         else
     393           0 :             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     394             :                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
     395           0 :                         NULL, 0, NULL))
     396             :             {
     397             :             case sal::detail::textenc::BAD_INPUT_STOP:
     398             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     399           0 :                 nHighSurrogate = 0;
     400           0 :                 break;
     401             : 
     402             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     403           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     404           0 :                 break;
     405             :             }
     406             :     }
     407             : 
     408             :  done:
                           /* Save the state for the next call and report results through the
                              optional out-parameters. */
     409      748011 :     if (pContext != NULL)
     410             :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
     411       14790 :             = nHighSurrogate;
     412      748011 :     if (pInfo != NULL)
     413      748011 :         *pInfo = nInfo;
     414      748011 :     if (pSrcCvtChars != NULL)
     415      748011 :         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
     416      748011 :     return pDestBufPtr - pDestBuf;
     417             : }
     418             : 
     419             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.10