Branch data Line data Source code
1 : : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : : /*
3 : : * This file is part of the LibreOffice project.
4 : : *
5 : : * This Source Code Form is subject to the terms of the Mozilla Public
6 : : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : : *
9 : : * This file incorporates work covered by the following license notice:
10 : : *
11 : : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : : * contributor license agreements. See the NOTICE file distributed
13 : : * with this work for additional information regarding copyright
14 : : * ownership. The ASF licenses this file to you under the Apache
15 : : * License, Version 2.0 (the "License"); you may not use this file
16 : : * except in compliance with the License. You may obtain a copy of
17 : : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : : */
19 : :
20 : :
21 : : #include <stdio.h>
22 : : #include <string.h>
23 : : #include <stdlib.h>
24 : : #include <errno.h>
25 : : #include <sal/main.h>
26 : : #include <sal/types.h>
27 : : #include <rtl/strbuf.hxx>
28 : : #include <rtl/ustring.hxx>
29 : : #include <osl/diagnose.h>
30 : : #include <vector>
31 : : using std::vector;
32 : :
33 : : using namespace ::rtl;
34 : :
35 : : /* Utility gendict:
36 : :
37 : : "BreakIterator_CJK provides input string caching and dictionary searching for
38 : : longest matching. You can provide a sorted dictionary (the encoding must be
39 : : UTF-8) by creating the following file:
40 : : i18npool/source/breakiterator/data/<language>.dict.
41 : :
42 : : The utility gendict will convert the file to C code, which will be compiled
43 : : into a shared library for dynamic loading.
44 : :
45 : : All dictionary searching and loading is performed in the xdictionary class.
46 : : The only thing you need to do is to derive your class from BreakIterator_CJK
47 : : and create an instance of the xdictionary with the language name and
48 : : pass it to the parent class." (from http://wiki.services.openoffice.org/wiki/
49 : : /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
50 : : */
51 : :
52 : : // The C standard guarantees that static variables are automatically initialized to 0
53 : : static sal_uInt8 exists[0x2000];
54 : : static sal_uInt32 charArray[0x10000];
55 : :
56 : 1193526 : static inline void set_exists(sal_uInt32 index)
57 : : {
58 : 1193526 : exists[index>>3] |= 1 << (index & 0x07);
59 : 1193526 : }
60 : :
/** Writes the fixed preamble of the generated file to @p source_fp:
    the "generated automatically" banner followed by the sal/types.h include. */
static inline void printIncludes(FILE* source_fp)
{
    static const char preamble[] =
        "/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n"
        "#include <sal/types.h>\n\n";
    fputs(preamble, source_fp);
}
66 : :
/** Writes the exported accessor functions for the five generated tables.

    Two variants are emitted, selected in the generated code by the
    DISABLE_DYNLOADING macro: when it is not defined the accessors have plain
    names (getExistMark, getIndex1, ...); otherwise each name gets the
    suffix "_<lang>".

    @param source_fp  stream receiving the generated code
    @param lang       suffix used for the "#else" variant; it is handed
                      straight to a "%s" conversion, so it must be a valid
                      C string */
static inline void printFunctions(FILE* source_fp, const char *lang)
{
    struct Accessor
    {
        const char *elementType;  // element type of the returned table
        const char *getter;       // base name of the accessor function
        const char *table;        // name of the generated static array
    };
    static const Accessor accessors[] = {
        { "sal_uInt8",   "getExistMark", "existMark" },
        { "sal_Int16",   "getIndex1",    "index1"    },
        { "sal_Int32",   "getIndex2",    "index2"    },
        { "sal_Int32",   "getLenArray",  "lenArray"  },
        { "sal_Unicode", "getDataArea",  "dataArea"  }
    };
    const size_t accessorCount = sizeof(accessors) / sizeof(accessors[0]);

    fputs("#ifndef DISABLE_DYNLOADING\n", source_fp);
    for (size_t n = 0; n < accessorCount; ++n)
    {
        const Accessor& a = accessors[n];
        fprintf(source_fp, "SAL_DLLPUBLIC_EXPORT const %s* %s() { return %s; }\n",
                a.elementType, a.getter, a.table);
    }

    fputs("#else\n", source_fp);
    for (size_t n = 0; n < accessorCount; ++n)
    {
        const Accessor& a = accessors[n];
        fprintf(source_fp, "SAL_DLLPUBLIC_EXPORT const %s* %s_%s() { return %s; }\n",
                a.elementType, a.getter, lang, a.table);
    }
    fputs("#endif\n", source_fp);
}
83 : :
84 : 2 : static inline void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
85 : : {
86 : : // generate main dict. data array
87 [ + - ]: 2 : fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
88 : : sal_Char str[1024];
89 : 2 : sal_uInt32 lenArrayCurr = 0;
90 : 2 : sal_Unicode current = 0;
91 : :
92 [ + - ][ + + ]: 371223 : while (fgets(str, 1024, dictionary_fp)) {
93 : : // input file is in UTF-8 encoding
94 : : // don't convert last new line character to Ostr.
95 [ + - ]: 371221 : OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
96 : 371221 : const sal_Unicode *u = Ostr.getStr();
97 : :
98 : 371221 : const sal_Int32 len = Ostr.getLength();
99 : :
100 : 371221 : sal_Int32 i=0;
101 [ + - ]: 371221 : Ostr.iterateCodePoints(&i, 1);
102 [ + + ]: 371221 : if (len == i)
103 : 4046 : continue; // skip one character word
104 : :
105 [ + + ]: 367175 : if (u[0] != current) {
106 : : OSL_ENSURE( (u[0] > current), "Dictionary file should be sorted");
107 : 10872 : current = u[0];
108 : 10872 : charArray[current] = lenArray.size();
109 : : }
110 : :
111 [ + - ]: 367175 : lenArray.push_back(lenArrayCurr);
112 : :
113 : 367175 : set_exists(u[0]);
114 : : // first character is stored in charArray, so start from second
115 [ + + ]: 1193526 : for (i = 1; i < len; i++, lenArrayCurr++) {
116 : 826351 : set_exists(u[i]);
117 [ + - ]: 826351 : fprintf(source_fp, "0x%04x, ", u[i]);
118 [ + + ]: 826351 : if ((lenArrayCurr & 0x0f) == 0x0f)
119 [ + - ]: 51646 : fputs("\n\t", source_fp);
120 : : }
121 [ + + ]: 371221 : }
122 [ + - ]: 2 : lenArray.push_back( lenArrayCurr ); // store last ending pointer
123 : 2 : charArray[current+1] = lenArray.size();
124 [ + - ]: 2 : fputs("\n};\n", source_fp);
125 : 2 : }
126 : :
127 : 2 : static inline void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
128 : : {
129 : 2 : fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
130 : 2 : fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
131 [ + + ]: 367179 : for (size_t k = 0; k < lenArray.size(); k++)
132 : : {
133 [ + + ]: 367177 : if( !(k & 0xf) )
134 : 22950 : fputs("\n\t", source_fp);
135 : :
136 : 367177 : fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k]));
137 : : }
138 : 2 : fputs("\n};\n", source_fp );
139 : 2 : }
140 : :
141 : : /* FIXME?: what happens if in every range i there is at least one charArray != 0
142 : : => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
143 : : => then in index2, the last range will be ignored incorrectly */
144 : 2 : static inline void printIndex1(FILE *source_fp, sal_Int16 *set)
145 : : {
146 : 2 : fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
147 : 2 : sal_Int16 count = 0;
148 [ + + ]: 514 : for (sal_Int32 i = 0; i < 0x100; i++) {
149 : 512 : sal_Int32 j = 0;
150 [ + + ][ + + ]: 90009 : while( j < 0x100 && charArray[(i<<8) + j] == 0)
[ + + ]
151 : 89497 : j++;
152 : :
153 [ + + ]: 512 : fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? count++ : 0xff));
154 [ + + ]: 512 : if ((i & 0x0f) == 0x0f)
155 : 32 : fputs ("\n\t", source_fp);
156 : : }
157 : 2 : fputs("};\n", source_fp);
158 : 2 : }
159 : :
160 : 2 : static inline void printIndex2(FILE *source_fp, sal_Int16 *set)
161 : : {
162 : 2 : fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
163 : 2 : sal_Int32 prev = 0;
164 [ + + ]: 514 : for (sal_Int32 i = 0; i < 0x100; i++) {
165 [ + + ]: 512 : if (set[i] != 0xff) {
166 [ + + ]: 42919 : for (sal_Int32 j = 0; j < 0x100; j++) {
167 : 42752 : sal_Int32 k = (i<<8) + j;
168 [ + + ]: 42752 : if (prev != 0 )
169 [ + + ][ + + ]: 110158 : while( k < 0x10000 && charArray[k] == 0 )
[ + + ]
170 : 99286 : k++;
171 : :
172 : 42752 : prev = charArray[(i<<8) + j];
173 [ + + ]: 42752 : fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
174 [ + + ]: 42752 : if ((j & 0x0f) == 0x0f)
175 : 2672 : fputs ("\n\t", source_fp);
176 : : }
177 : 167 : fputs ("\n\t", source_fp);
178 : : }
179 : : }
180 : 2 : fputs ("\n};\n", source_fp);
181 : 2 : }
182 : :
183 : : /* Generates a bitmask for the existance of sal_Unicode values in dictionary;
184 : : it packs 8 sal_Bool values in 1 sal_uInt8 */
185 : 2 : static inline void printExistsMask(FILE *source_fp)
186 : : {
187 : 2 : fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
188 [ + + ]: 16386 : for (unsigned int i = 0; i < 0x2000; i++)
189 : : {
190 : 16384 : fprintf(source_fp, "0x%02x, ", exists[i]);
191 [ + + ]: 16384 : if ( (i & 0xf) == 0xf )
192 : 1024 : fputs("\n\t", source_fp);
193 : : }
194 : 2 : fputs("\n};\n", source_fp);
195 : 2 : }
196 : :
197 : 2 : SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
198 : : {
199 : : FILE *dictionary_fp, *source_fp;
200 : :
201 [ + - ][ - + ]: 2 : if (argc == 1 || argc > 4)
202 : : {
203 [ # # ]: 0 : fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
204 : 0 : exit(-1);
205 : : }
206 : :
207 [ + - ]: 2 : dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
208 [ - + ]: 2 : if (dictionary_fp == NULL)
209 : : {
210 [ # # ]: 0 : fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
211 : 0 : exit(1);
212 : : }
213 : :
214 [ - + ]: 2 : if(argc == 2)
215 : 0 : source_fp = stdout;
216 : : else
217 : : {
218 : : // create the C source file to write
219 [ + - ]: 2 : source_fp = fopen(argv[2], "wb");
220 [ - + ]: 2 : if (source_fp == NULL) {
221 [ # # ]: 0 : fclose(dictionary_fp);
222 [ # # ]: 0 : fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
223 : 0 : exit(1);
224 : : }
225 : : }
226 : :
227 [ + - ]: 2 : vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea
228 : : sal_Int16 set[0x100];
229 : :
230 [ + - ]: 2 : printIncludes(source_fp);
231 [ + - ]: 2 : fputs("extern \"C\" {\n", source_fp);
232 [ + - ]: 2 : printDataArea(dictionary_fp, source_fp, lenArray);
233 [ + - ]: 2 : printLenArray(source_fp, lenArray);
234 [ + - ]: 2 : printIndex1(source_fp, set);
235 [ + - ]: 2 : printIndex2(source_fp, set);
236 [ + - ]: 2 : printExistsMask(source_fp);
237 [ + - ]: 2 : printFunctions(source_fp, argv[3]);
238 [ + - ]: 2 : fputs("}\n", source_fp);
239 : :
240 [ + - ]: 2 : fclose(dictionary_fp);
241 [ + - ]: 2 : fclose(source_fp);
242 : :
243 : 2 : return 0;
244 : : }
245 : :
246 : : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|