LCOV - code coverage report
Current view: top level - i18npool/source/breakiterator - gendict.cxx (source / functions) Hit Total Coverage
Test: libreoffice_filtered.info Lines: 115 123 93.5 %
Date: 2012-08-25 Functions: 10 10 100.0 %
Legend: Lines: hit not hit | Branches: + taken - not taken # not executed Branches: 78 114 68.4 %

           Branch data     Line data    Source code
       1                 :            : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2                 :            : /*
       3                 :            :  * This file is part of the LibreOffice project.
       4                 :            :  *
       5                 :            :  * This Source Code Form is subject to the terms of the Mozilla Public
       6                 :            :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7                 :            :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8                 :            :  *
       9                 :            :  * This file incorporates work covered by the following license notice:
      10                 :            :  *
      11                 :            :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12                 :            :  *   contributor license agreements. See the NOTICE file distributed
      13                 :            :  *   with this work for additional information regarding copyright
      14                 :            :  *   ownership. The ASF licenses this file to you under the Apache
      15                 :            :  *   License, Version 2.0 (the "License"); you may not use this file
      16                 :            :  *   except in compliance with the License. You may obtain a copy of
      17                 :            :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18                 :            :  */
      19                 :            : 
      20                 :            : 
      21                 :            : #include <stdio.h>
      22                 :            : #include <string.h>
      23                 :            : #include <stdlib.h>
      24                 :            : #include <errno.h>
      25                 :            : #include <sal/main.h>
      26                 :            : #include <sal/types.h>
      27                 :            : #include <rtl/strbuf.hxx>
      28                 :            : #include <rtl/ustring.hxx>
      29                 :            : #include <osl/diagnose.h>
      30                 :            : #include <vector>
      31                 :            : using std::vector;
      32                 :            : 
      33                 :            : using namespace ::rtl;
      34                 :            : 
      35                 :            : /* Utility gendict:
      36                 :            : 
      37                 :            :    "BreakIterator_CJK provides input string caching and dictionary searching for
      38                 :            :    longest matching. You can provide a sorted dictionary (the encoding must be
      39                 :            :    UTF-8) by creating the following file:
      40                 :            :             i18npool/source/breakiterator/data/<language>.dict.
      41                 :            : 
      42                 :            :    The utility gendict will convert the file to C code, which will be compiled
      43                 :            :    into a shared library for dynamic loading.
      44                 :            : 
      45                 :            :    All dictionary searching and loading is performed in the xdictionary class.
      46                 :            :    The only thing you need to do is to derive your class from BreakIterator_CJK
      47                 :            :    and create an instance of the xdictionary with the language name and
      48                 :            :    pass it to the parent class." (from http://wiki.services.openoffice.org/wiki/
      49                 :            :    /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
      50                 :            : */
      51                 :            : 
      52                 :            : // The C standard guarantees that static variables are automatically initialized to 0
      53                 :            : static sal_uInt8 exists[0x2000]; // bitmap, 8 flags per byte: set_exists(c) sets bit (c & 7) of exists[c >> 3]
      54                 :            : static sal_uInt32 charArray[0x10000]; // indexed by a word's first sal_Unicode; printDataArea stores lenArray.size() here when that first unit is first seen
      55                 :            : 
      56                 :    1193526 : static inline void set_exists(sal_uInt32 index) // Sets the flag for `index` in the exists[] bitmap.
      57                 :            : {
      58                 :    1193526 :    exists[index>>3] |= 1 << (index & 0x07); // byte index>>3, bit index&7; no bounds check: exists[] has 0x2000 bytes, so index must be < 0x10000 (callers here pass sal_Unicode values)
      59                 :    1193526 : }
      60                 :            : 
      61                 :          2 : static inline void printIncludes(FILE* source_fp) // Writes the "generated file" banner and the sal/types.h include to source_fp.
      62                 :            : {
      63                 :          2 :     fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp); // fputs results are not checked here
      64                 :          2 :     fputs("#include <sal/types.h>\n\n", source_fp);
      65                 :          2 : }
      66                 :            : 
      67                 :          2 : static inline void printFunctions(FILE* source_fp, const char *lang) // Writes the accessor functions that return the generated arrays.
      68                 :            : {
      69                 :          2 :     fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp); // first variant: plain accessor names
      70                 :          2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n", source_fp);
      71                 :          2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n", source_fp);
      72                 :          2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n", source_fp);
      73                 :          2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n", source_fp);
      74                 :          2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n", source_fp);
      75                 :          2 :     fputs ("#else\n", source_fp); // second variant: accessor names suffixed with "_" + lang
      76                 :          2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", lang); // NOTE(review): lang is formatted with %s and must not be NULL
      77                 :          2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", lang);
      78                 :          2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", lang);
      79                 :          2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", lang);
      80                 :          2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", lang);
      81                 :          2 :     fputs ("#endif\n", source_fp);
      82                 :          2 : }
      83                 :            : 
      84                 :          2 : static inline void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray) // Reads the dictionary line by line, writes the dataArea[] table, and fills lenArray, charArray[] and exists[].
      85                 :            : {
      86                 :            :     // generate main dict. data array
      87         [ +  - ]:          2 :     fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
      88                 :            :     sal_Char str[1024]; // one input line; fgets below reads at most 1023 bytes per call
      89                 :          2 :     sal_uInt32 lenArrayCurr = 0; // number of values written to dataArea so far
      90                 :          2 :     sal_Unicode current = 0; // first code unit of the most recently accepted word
      91                 :            : 
      92 [ +  - ][ +  + ]:     371223 :     while (fgets(str, 1024, dictionary_fp)) {
      93                 :            :         // input file is in UTF-8 encoding
      94                 :            :         // don't convert last new line character to Ostr. NOTE(review): assumes every chunk fgets returns ends in '\n'; a final line without one, or a line too long for the buffer, loses its last byte
      95         [ +  - ]:     371221 :         OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
      96                 :     371221 :         const sal_Unicode *u = Ostr.getStr();
      97                 :            : 
      98                 :     371221 :         const sal_Int32 len = Ostr.getLength();
      99                 :            : 
     100                 :     371221 :         sal_Int32 i=0;
     101         [ +  - ]:     371221 :         Ostr.iterateCodePoints(&i, 1); // presumably leaves i just past the first code point (OUString API; confirm)
     102         [ +  + ]:     371221 :         if (len == i)
     103                 :       4046 :             continue;   // skip one character word
     104                 :            : 
     105         [ +  + ]:     367175 :         if (u[0] != current) {
     106                 :            :             OSL_ENSURE( (u[0] > current), "Dictionary file should be sorted");
     107                 :      10872 :             current = u[0];
     108                 :      10872 :             charArray[current] = lenArray.size(); // lenArray index of the first word that starts with this code unit
     109                 :            :         }
     110                 :            : 
     111         [ +  - ]:     367175 :         lenArray.push_back(lenArrayCurr); // offset in dataArea where this word's remaining units begin
     112                 :            : 
     113                 :     367175 :         set_exists(u[0]);
     114                 :            :         // first character is stored in charArray, so start from second (always code unit 1, whatever i was after iterateCodePoints)
     115         [ +  + ]:    1193526 :         for (i = 1; i < len; i++, lenArrayCurr++) {
     116                 :     826351 :             set_exists(u[i]);
     117         [ +  - ]:     826351 :             fprintf(source_fp, "0x%04x, ", u[i]);
     118         [ +  + ]:     826351 :             if ((lenArrayCurr & 0x0f) == 0x0f) // line break after every 16th value
     119         [ +  - ]:      51646 :                 fputs("\n\t", source_fp);
     120                 :            :         }
     121         [ +  + ]:     371221 :     }
     122         [ +  - ]:          2 :     lenArray.push_back( lenArrayCurr ); // store last ending pointer
     123                 :          2 :     charArray[current+1] = lenArray.size(); // NOTE(review): if current can be 0xFFFF this writes charArray[0x10000], one past the end of the array
     124         [ +  - ]:          2 :     fputs("\n};\n", source_fp);
     125                 :          2 : }
     126                 :            : 
     127                 :          2 : static inline void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray) // Writes the lenArray[] table: one leading 0 followed by every entry of lenArray in hex.
     128                 :            : {
     129                 :          2 :     fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
     130                 :          2 :     fprintf(source_fp, "0x%x, ", 0); // insert one slot for skipping 0 in index2 array.
     131         [ +  + ]:     367179 :     for (size_t k = 0; k < lenArray.size(); k++)
     132                 :            :     {
     133         [ +  + ]:     367177 :         if( !(k & 0xf) ) // line break before every entry whose index is a multiple of 16
     134                 :      22950 :             fputs("\n\t", source_fp);
     135                 :            : 
     136                 :     367177 :         fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k])); // cast so the argument matches the %lx conversion
     137                 :            :     }
     138                 :          2 :     fputs("\n};\n", source_fp );
     139                 :          2 : }
     140                 :            : 
     141                 :            : /* FIXME?: what happens if in every range i there is at least one charArray != 0
     142                 :            :        => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
     143                 :            :        => then in index2, the last range will be ignored incorrectly */
     144                 :          2 : static inline void printIndex1(FILE *source_fp, sal_Int16 *set) // Writes index1[] and fills set[0..0xff]: one entry per 256-entry range of charArray.
     145                 :            : {
     146                 :          2 :     fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
     147                 :          2 :     sal_Int16 count = 0; // number of non-empty ranges seen so far
     148         [ +  + ]:        514 :     for (sal_Int32 i = 0; i < 0x100; i++) {
     149                 :        512 :         sal_Int32 j = 0;
     150 [ +  + ][ +  + ]:      90009 :         while( j < 0x100 && charArray[(i<<8) + j] == 0) // find the first non-zero entry in range i, if any
                 [ +  + ]
     151                 :      89497 :             j++;
     152                 :            : 
     153         [ +  + ]:        512 :         fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? count++ : 0xff)); // non-empty range gets the next sequential number, an all-zero range gets 0xff
     154         [ +  + ]:        512 :         if ((i & 0x0f) == 0x0f) // line break after every 16th value
     155                 :         32 :             fputs ("\n\t", source_fp);
     156                 :            :     }
     157                 :          2 :     fputs("};\n", source_fp);
     158                 :          2 : }
     159                 :            : 
     160                 :          2 : static inline void printIndex2(FILE *source_fp, sal_Int16 *set) // Writes index2[]: 256 entries for every range i whose set[i] is not 0xff.
     161                 :            : {
     162                 :          2 :     fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
     163                 :          2 :     sal_Int32 prev = 0; // charArray value at the previously visited position; carried across ranges
     164         [ +  + ]:        514 :     for (sal_Int32 i = 0; i < 0x100; i++) {
     165         [ +  + ]:        512 :         if (set[i] != 0xff) { // ranges marked 0xff in set[] produce no output
     166         [ +  + ]:      42919 :             for (sal_Int32 j = 0; j < 0x100; j++) {
     167                 :      42752 :                 sal_Int32 k = (i<<8) + j;
     168         [ +  + ]:      42752 :                 if (prev != 0 ) // only when the previous position was non-zero: advance k to the next non-zero entry
     169 [ +  + ][ +  + ]:     110158 :                     while( k < 0x10000 && charArray[k] == 0 )
                 [ +  + ]
     170                 :      99286 :                         k++;
     171                 :            : 
     172                 :      42752 :                 prev = charArray[(i<<8) + j]; // remembers the current position's own value, not charArray[k]
     173         [ +  + ]:      42752 :                 fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0)); // +1 presumably accounts for the extra leading slot printLenArray emits; 0 when the scan ran off the end
     174         [ +  + ]:      42752 :                 if ((j & 0x0f) == 0x0f) // line break after every 16th value
     175                 :       2672 :                     fputs ("\n\t", source_fp);
     176                 :            :             }
     177                 :        167 :             fputs ("\n\t", source_fp);
     178                 :            :         }
     179                 :            :     }
     180                 :          2 :     fputs ("\n};\n", source_fp);
     181                 :          2 : }
     182                 :            : 
     183                 :            : /* Generates a bitmask for the existence of sal_Unicode values in dictionary;
     184                 :            :    it packs 8 sal_Bool values in 1 sal_uInt8 */
     185                 :          2 : static inline void printExistsMask(FILE *source_fp) // Writes all 0x2000 bytes of exists[] as the existMark[] table.
     186                 :            : {
     187                 :          2 :     fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
     188         [ +  + ]:      16386 :     for (unsigned int i = 0; i < 0x2000; i++)
     189                 :            :     {
     190                 :      16384 :         fprintf(source_fp, "0x%02x, ", exists[i]);
     191         [ +  + ]:      16384 :         if ( (i & 0xf) == 0xf ) // line break after every 16th byte
     192                 :       1024 :             fputs("\n\t", source_fp);
     193                 :            :     }
     194                 :          2 :     fputs("\n};\n", source_fp);
     195                 :          2 : }
     196                 :            : 
     197                 :          2 : SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) // Entry point: reads the dictionary named by argv[1] and writes the generated tables to argv[2] (or stdout).
     198                 :            : {
     199                 :            :     FILE *dictionary_fp, *source_fp;
     200                 :            : 
     201 [ +  - ][ -  + ]:          2 :     if (argc == 1 || argc > 4) // NOTE(review): accepts argc 2..4 although the message below says 3 arguments are required
     202                 :            :     {
     203         [ #  # ]:          0 :         fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
     204                 :          0 :         exit(-1); // NOTE(review): the other failure paths in this function use exit(1)
     205                 :            :     }
     206                 :            : 
     207         [ +  - ]:          2 :     dictionary_fp = fopen(argv[1], "rb");   // open the dictionary file for reading
     208         [ -  + ]:          2 :     if (dictionary_fp == NULL)
     209                 :            :     {
     210         [ #  # ]:          0 :         fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
     211                 :          0 :         exit(1);
     212                 :            :     }
     213                 :            : 
     214         [ -  + ]:          2 :     if(argc == 2) // no output file name given: write to stdout
     215                 :          0 :         source_fp = stdout;
     216                 :            :     else
     217                 :            :     {
     218                 :            :         // create the C source file to write
     219         [ +  - ]:          2 :         source_fp = fopen(argv[2], "wb");
     220         [ -  + ]:          2 :         if (source_fp == NULL) {
     221         [ #  # ]:          0 :             fclose(dictionary_fp); // NOTE(review): runs before strerror(errno) below and may change errno
     222         [ #  # ]:          0 :             fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
     223                 :          0 :             exit(1);
     224                 :            :         }
     225                 :            :     }
     226                 :            : 
     227         [ +  - ]:          2 :     vector<sal_uInt32> lenArray;   // stores the word boundaries in DataArea
     228                 :            :     sal_Int16 set[0x100]; // filled by printIndex1, read by printIndex2
     229                 :            : 
     230         [ +  - ]:          2 :     printIncludes(source_fp);
     231         [ +  - ]:          2 :     fputs("extern \"C\" {\n", source_fp);
     232         [ +  - ]:          2 :         printDataArea(dictionary_fp, source_fp, lenArray); // must run first: it fills lenArray, charArray and exists used by the calls below
     233         [ +  - ]:          2 :         printLenArray(source_fp, lenArray);
     234         [ +  - ]:          2 :         printIndex1(source_fp, set);
     235         [ +  - ]:          2 :         printIndex2(source_fp, set);
     236         [ +  - ]:          2 :         printExistsMask(source_fp);
     237         [ +  - ]:          2 :         printFunctions(source_fp, argv[3]); // NOTE(review): argv[3] is only a valid string when argc == 4; the check above also lets argc 2 and 3 through
     238         [ +  - ]:          2 :     fputs("}\n", source_fp);
     239                 :            : 
     240         [ +  - ]:          2 :     fclose(dictionary_fp);
     241         [ +  - ]:          2 :     fclose(source_fp); // result not checked, so a failed flush of the generated file goes unreported
     242                 :            : 
     243                 :          2 :     return 0;
     244                 :            : }
     245                 :            : 
     246                 :            : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.10