LCOV - code coverage report
Current view: top level - i18npool/source/breakiterator - gendict.cxx (source / functions) Hit Total Coverage
Test: commit 10e77ab3ff6f4314137acd6e2702a6e5c1ce1fae Lines: 116 124 93.5 %
Date: 2014-11-03 Functions: 10 10 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : #include <stdio.h>
      21             : #include <string.h>
      22             : #include <stdlib.h>
      23             : #include <errno.h>
      24             : #include <sal/main.h>
      25             : #include <sal/types.h>
      26             : #include <rtl/strbuf.hxx>
      27             : #include <rtl/ustring.hxx>
      28             : #include <osl/diagnose.h>
      29             : #include <vector>
      30             : 
      31             : using std::vector;
      32             : 
      33             : 
      34             : // For iOS, where we must strive for a minimal executable size, we
      35             : // keep the data produced by this utility not as large const tables in
      36             : // source code but instead as separate data files, to be bundled with
      37             : // an app, and mmapped in at run time.
      38             : 
      39             : // To test this more easily on a desktop OS, just make sure
      40             : // DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
      41             : 
      42             : #ifdef DICT_JA_ZH_IN_DATAFILE
      43             : static sal_Int64 dataAreaOffset = 0;
      44             : static sal_Int64 lenArrayOffset = 0;
      45             : static sal_Int64 index1Offset = 0;
      46             : static sal_Int64 index2Offset = 0;
      47             : static sal_Int64 existMarkOffset = 0;
      48             : #endif
      49             : 
      50             : /* Utility gendict:
      51             : 
      52             :    "BreakIterator_CJK provides input string caching and dictionary searching for
      53             :    longest matching. You can provide a sorted dictionary (the encoding must be
      54             :    UTF-8) by creating the following file:
      55             :             i18npool/source/breakiterator/data/<language>.dict.
      56             : 
      57             :    The utility gendict will convert the file to C code, which will be compiled
      58             :    into a shared library for dynamic loading.
      59             : 
      60             :    All dictionary searching and loading is performed in the xdictionary class.
      61             :    The only thing you need to do is to derive your class from BreakIterator_CJK
      62             :    and create an instance of the xdictionary with the language name and
      63             :    pass it to the parent class." (from http://wiki.openoffice.org/wiki/
      64             :    Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
      65             : */
      66             : 
      67             : // C-standard guarantees that static variables are automatically initialized to 0
      68             : static sal_uInt8 exists[0x2000];
      69             : static sal_uInt32 charArray[0x10000];
      70             : 
      71     1193526 : static inline void set_exists(sal_uInt32 index)
      72             : {
      73     1193526 :    exists[index>>3] |= 1 << (index & 0x07);
      74     1193526 : }
      75             : 
      76           2 : static inline void printIncludes(FILE* source_fp)
      77             : {
      78             : #ifndef DICT_JA_ZH_IN_DATAFILE
      79           2 :     fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp);
      80           2 :     fputs("#include <sal/types.h>\n\n", source_fp);
      81             : #else
      82             :     (void) source_fp;
      83             : #endif
      84           2 : }
      85             : 
      86           2 : static inline void printFunctions(FILE* source_fp, const char *lang)
      87             : {
      88             : #ifndef DICT_JA_ZH_IN_DATAFILE
      89           2 :     fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp);
      90           2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n", source_fp);
      91           2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n", source_fp);
      92           2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n", source_fp);
      93           2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n", source_fp);
      94           2 :     fputs ("SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n", source_fp);
      95           2 :     fputs ("#else\n", source_fp);
      96           2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", lang);
      97           2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", lang);
      98           2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", lang);
      99           2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", lang);
     100           2 :     fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", lang);
     101           2 :     fputs ("#endif\n", source_fp);
     102             : #else
     103             :     (void) source_fp;
     104             :     (void) lang;
     105             : #endif
     106           2 : }
     107             : 
     108           2 : static inline void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
     109             : {
     110             :     // generate main dict. data array
     111             : #ifndef DICT_JA_ZH_IN_DATAFILE
     112           2 :     fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
     113             : #else
     114             :     dataAreaOffset = ftell(source_fp);
     115             : #endif
     116             :     sal_Char str[1024];
     117           2 :     sal_uInt32 lenArrayCurr = 0;
     118           2 :     sal_Unicode current = 0;
     119             : 
     120      371225 :     while (fgets(str, 1024, dictionary_fp)) {
     121             :         // input file is in UTF-8 encoding
     122             :         // don't convert last new line character to Ostr.
     123      371221 :         OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
     124      371221 :         const sal_Unicode *u = Ostr.getStr();
     125             : 
     126      371221 :         const sal_Int32 len = Ostr.getLength();
     127             : 
     128      371221 :         sal_Int32 i=0;
     129      371221 :         Ostr.iterateCodePoints(&i, 1);
     130      371221 :         if (len == i)
     131        4046 :             continue;   // skip one character word
     132             : 
     133      367175 :         if (u[0] != current) {
     134             :             OSL_ENSURE( (u[0] > current), "Dictionary file should be sorted");
     135       10872 :             current = u[0];
     136       10872 :             charArray[current] = lenArray.size();
     137             :         }
     138             : 
     139      367175 :         lenArray.push_back(lenArrayCurr);
     140             : 
     141      367175 :         set_exists(u[0]);
     142             :         // first character is stored in charArray, so start from second
     143     1193526 :         for (i = 1; i < len; i++, lenArrayCurr++) {
     144      826351 :             set_exists(u[i]);
     145             : #ifndef DICT_JA_ZH_IN_DATAFILE
     146      826351 :             fprintf(source_fp, "0x%04x, ", u[i]);
     147      826351 :             if ((lenArrayCurr & 0x0f) == 0x0f)
     148       51646 :                 fputs("\n\t", source_fp);
     149             : #else
     150             :             fwrite(&u[i], sizeof(u[i]), 1, source_fp);
     151             : #endif
     152             :         }
     153      367175 :     }
     154           2 :     lenArray.push_back( lenArrayCurr ); // store last ending pointer
     155           2 :     charArray[current+1] = lenArray.size();
     156             : #ifndef DICT_JA_ZH_IN_DATAFILE
     157           2 :     fputs("\n};\n", source_fp);
     158             : #endif
     159           2 : }
     160             : 
     161           2 : static inline void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
     162             : {
     163             : #ifndef DICT_JA_ZH_IN_DATAFILE
     164           2 :     fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
     165           2 :     fprintf(source_fp, "0x%x, ", 0); // insert one slot for skipping 0 in index2 array.
     166             : #else
     167             :     lenArrayOffset = ftell(source_fp);
     168             :     sal_uInt32 zero(0);
     169             :     fwrite(&zero, sizeof(zero), 1, source_fp);
     170             : #endif
     171      367179 :     for (size_t k = 0; k < lenArray.size(); k++)
     172             :     {
     173      367177 :         if( !(k & 0xf) )
     174       22950 :             fputs("\n\t", source_fp);
     175             : 
     176             : #ifndef DICT_JA_ZH_IN_DATAFILE
     177      367177 :         fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k]));
     178             : #else
     179             :         fwrite(&lenArray[k], sizeof(lenArray[k]), 1, source_fp);
     180             : #endif
     181             :     }
     182             : 
     183             : #ifndef DICT_JA_ZH_IN_DATAFILE
     184           2 :     fputs("\n};\n", source_fp );
     185             : #endif
     186           2 : }
     187             : 
     188             : /* FIXME?: what happens if in every range i there is at least one charArray != 0
     189             :        => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
     190             :        => then in index2, the last range will be ignored incorrectly */
     191           2 : static inline void printIndex1(FILE *source_fp, sal_Int16 *set)
     192             : {
     193             : #ifndef DICT_JA_ZH_IN_DATAFILE
     194           2 :     fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
     195             : #else
     196             :     index1Offset = ftell(source_fp);
     197             : #endif
     198             : 
     199           2 :     sal_Int16 count = 0;
     200         514 :     for (sal_Int32 i = 0; i < 0x100; i++) {
     201         512 :         sal_Int32 j = 0;
     202       90521 :         while( j < 0x100 && charArray[(i<<8) + j] == 0)
     203       89497 :             j++;
     204             : 
     205         512 :         set[i] = (j < 0x100 ? count++ : 0xff);
     206             : #ifndef DICT_JA_ZH_IN_DATAFILE
     207         512 :         fprintf(source_fp, "0x%02x, ", set[i]);
     208         512 :         if ((i & 0x0f) == 0x0f)
     209          32 :             fputs ("\n\t", source_fp);
     210             : #else
     211             :         fwrite(&set[i], sizeof(set[i]), 1, source_fp);
     212             : #endif
     213             :     }
     214             : 
     215             : #ifndef DICT_JA_ZH_IN_DATAFILE
     216           2 :     fputs("};\n", source_fp);
     217             : #endif
     218           2 : }
     219             : 
     220           2 : static inline void printIndex2(FILE *source_fp, sal_Int16 *set)
     221             : {
     222             : #ifndef DICT_JA_ZH_IN_DATAFILE
     223           2 :     fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
     224             : #else
     225             :     index2Offset = ftell(source_fp);
     226             : #endif
     227           2 :     sal_Int32 prev = 0;
     228         514 :     for (sal_Int32 i = 0; i < 0x100; i++) {
     229         512 :         if (set[i] != 0xff) {
     230       42919 :             for (sal_Int32 j = 0; j < 0x100; j++) {
     231       42752 :                 sal_Int32 k = (i<<8) + j;
     232       42752 :                 if (prev != 0 )
     233      121030 :                     while( k < 0x10000 && charArray[k] == 0 )
     234       99286 :                         k++;
     235             : 
     236       42752 :                 prev = charArray[(i<<8) + j];
     237             : #ifndef DICT_JA_ZH_IN_DATAFILE
     238       42752 :                 fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
     239       42752 :                 if ((j & 0x0f) == 0x0f)
     240        2672 :                     fputs ("\n\t", source_fp);
     241             : #else
     242             :                 sal_uInt32 n = (k < 0x10000 ? charArray[k] + 1 : 0);
     243             :                 fwrite(&n, sizeof(n), 1, source_fp);
     244             : #endif
     245             :             }
     246             : #ifndef DICT_JA_ZH_IN_DATAFILE
     247         167 :             fputs ("\n\t", source_fp);
     248             : #endif
     249             :         }
     250             :     }
     251             : #ifndef DICT_JA_ZH_IN_DATAFILE
     252           2 :     fputs ("\n};\n", source_fp);
     253             : #endif
     254           2 : }
     255             : 
     256             : /* Generates a bitmask for the existence of sal_Unicode values in dictionary;
     257             :    it packs 8 sal_Bool values in 1 sal_uInt8 */
     258           2 : static inline void printExistsMask(FILE *source_fp)
     259             : {
     260             : #ifndef DICT_JA_ZH_IN_DATAFILE
     261           2 :     fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
     262             : #else
     263             :     existMarkOffset = ftell(source_fp);
     264             : #endif
     265       16386 :     for (unsigned int i = 0; i < 0x2000; i++)
     266             :     {
     267             : #ifndef DICT_JA_ZH_IN_DATAFILE
     268       16384 :         fprintf(source_fp, "0x%02x, ", exists[i]);
     269       16384 :         if ( (i & 0xf) == 0xf )
     270        1024 :             fputs("\n\t", source_fp);
     271             : #else
     272             :         fwrite(&exists[i], sizeof(exists[i]), 1, source_fp);
     273             : #endif
     274             :     }
     275             : 
     276             : #ifndef DICT_JA_ZH_IN_DATAFILE
     277           2 :     fputs("\n};\n", source_fp);
     278             : #endif
     279           2 : }
     280             : 
     281           4 : SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
     282             : {
     283             :     FILE *dictionary_fp, *source_fp;
     284             : 
     285           2 :     if (argc == 1 || argc > 4)
     286             :     {
     287           0 :         fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
     288           0 :         exit(-1);
     289             :     }
     290             : 
     291           2 :     dictionary_fp = fopen(argv[1], "rb");   // open the source file for read;
     292           2 :     if (dictionary_fp == NULL)
     293             :     {
     294           0 :         fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
     295           0 :         exit(1);
     296             :     }
     297             : 
     298           2 :     if(argc == 2)
     299           0 :         source_fp = stdout;
     300             :     else
     301             :     {
     302             :         // create the C source file to write
     303           2 :         source_fp = fopen(argv[2], "wb");
     304           2 :         if (source_fp == NULL) {
     305           0 :             fclose(dictionary_fp);
     306           0 :             fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
     307           0 :             exit(1);
     308             :         }
     309             :     }
     310             : 
     311           2 :     vector<sal_uInt32> lenArray;   // stores the word boundaries in DataArea
     312             :     sal_Int16 set[0x100];
     313             : 
     314           2 :     printIncludes(source_fp);
     315             : #ifndef DICT_JA_ZH_IN_DATAFILE
     316           2 :     fputs("extern \"C\" {\n", source_fp);
     317             : #endif
     318           2 :     printDataArea(dictionary_fp, source_fp, lenArray);
     319           2 :     printLenArray(source_fp, lenArray);
     320           2 :     printIndex1(source_fp, set);
     321           2 :     printIndex2(source_fp, set);
     322           2 :     printExistsMask(source_fp);
     323           2 :     printFunctions(source_fp, argv[3]);
     324             : #ifndef DICT_JA_ZH_IN_DATAFILE
     325           2 :     fputs("}\n", source_fp);
     326             : #else
     327             :     // Put pointers to the tables at the end of the file...
     328             :     fwrite(&dataAreaOffset, sizeof(dataAreaOffset), 1, source_fp);
     329             :     fwrite(&lenArrayOffset, sizeof(lenArrayOffset), 1, source_fp);
     330             :     fwrite(&index1Offset, sizeof(index1Offset), 1, source_fp);
     331             :     fwrite(&index2Offset, sizeof(index2Offset), 1, source_fp);
     332             :     fwrite(&existMarkOffset, sizeof(existMarkOffset), 1, source_fp);
     333             : #endif
     334             : 
     335           2 :     fclose(dictionary_fp);
     336           2 :     fclose(source_fp);
     337             : 
     338           2 :     return 0;
     339             : }
     340             : 
     341             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.10