Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include <stdio.h>
21 : #include <string.h>
22 : #include <stdlib.h>
23 : #include <errno.h>
24 : #include <sal/main.h>
25 : #include <sal/types.h>
26 : #include <rtl/strbuf.hxx>
27 : #include <rtl/ustring.hxx>
28 : #include <osl/diagnose.h>
29 : #include <vector>
30 :
31 : using std::vector;
32 :
33 :
34 : // For iOS, where we must strive for a minimal executable size, we
35 : // keep the data produced by this utility not as large const tables in
36 : // source code but instead as separate data files, to be bundled with
37 : // an app, and mmapped in at run time.
38 :
39 : // To test this more easily on a desktop OS, just make sure
40 : // DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
41 :
42 : #ifdef DICT_JA_ZH_IN_DATAFILE
43 : static sal_Int64 dataAreaOffset = 0;
44 : static sal_Int64 lenArrayOffset = 0;
45 : static sal_Int64 index1Offset = 0;
46 : static sal_Int64 index2Offset = 0;
47 : static sal_Int64 existMarkOffset = 0;
48 : #endif
49 :
50 : /* Utility gendict:
51 :
52 : "BreakIterator_CJK provides input string caching and dictionary searching for
53 : longest matching. You can provide a sorted dictionary (the encoding must be
54 : UTF-8) by creating the following file:
55 : i18npool/source/breakiterator/data/<language>.dict.
56 :
57 : The utility gendict will convert the file to C code, which will be compiled
58 : into a shared library for dynamic loading.
59 :
60 : All dictionary searching and loading is performed in the xdictionary class.
61 : The only thing you need to do is to derive your class from BreakIterator_CJK
62 : and create an instance of the xdictionary with the language name and
63 : pass it to the parent class." (from http://wiki.openoffice.org/wiki/
64 : /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
65 : */
66 :
67 : // The C standard guarantees that static variables are automatically initialized to 0
68 : static sal_uInt8 exists[0x2000];
69 : static sal_uInt32 charArray[0x10000];
70 :
71 1193526 : static inline void set_exists(sal_uInt32 index)
72 : {
73 1193526 : exists[index>>3] |= 1 << (index & 0x07);
74 1193526 : }
75 :
/** Writes the preamble of the generated source file: a "do not edit" banner
 *  followed by the include line the generated tables rely on.
 *
 *  When DICT_JA_ZH_IN_DATAFILE is defined nothing is written; the parameter
 *  is then only referenced to silence the unused-parameter warning.
 *
 *  @param source_fp  output stream receiving the generated text
 */
static inline void printIncludes(FILE* source_fp)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;
#else
    static const char* const preamble[] = {
        "/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n",
        "#include <sal/types.h>\n\n"
    };
    for (size_t n = 0; n < sizeof(preamble) / sizeof(preamble[0]); ++n)
        fputs(preamble[n], source_fp);
#endif
}
85 :
/** Writes the exported accessor functions that hand out the generated tables.
 *
 *  Two variants are emitted, selected in the generated file by
 *  DISABLE_DYNLOADING: plain names (getExistMark, getIndex1, ...) when that
 *  macro is not defined, and names suffixed with "_<lang>" otherwise.
 *
 *  When DICT_JA_ZH_IN_DATAFILE is defined nothing is written.
 *
 *  @param source_fp  output stream receiving the generated text
 *  @param lang       language code appended to the accessor names in the
 *                    DISABLE_DYNLOADING variant
 */
static inline void printFunctions(FILE* source_fp, const char *lang)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;
    (void) lang;
#else
    // Variant without a language suffix: these lines are fixed text.
    static const char* const plainAccessors[] = {
        "#ifndef DISABLE_DYNLOADING\n",
        "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n",
        "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n",
        "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n",
        "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n",
        "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n",
        "#else\n"
    };
    for (size_t n = 0; n < sizeof(plainAccessors) / sizeof(plainAccessors[0]); ++n)
        fputs(plainAccessors[n], source_fp);

    // Variant whose names carry the language code.
    fprintf(source_fp,
            "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n",
            lang);
    fprintf(source_fp,
            "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n",
            lang);
    fprintf(source_fp,
            "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n",
            lang);
    fprintf(source_fp,
            "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n",
            lang);
    fprintf(source_fp,
            "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n",
            lang);

    fputs("#endif\n", source_fp);
#endif
}
107 :
108 2 : static inline void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
109 : {
110 : // generate main dict. data array
111 : #ifndef DICT_JA_ZH_IN_DATAFILE
112 2 : fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
113 : #else
114 : dataAreaOffset = ftell(source_fp);
115 : #endif
116 : sal_Char str[1024];
117 2 : sal_uInt32 lenArrayCurr = 0;
118 2 : sal_Unicode current = 0;
119 :
120 371225 : while (fgets(str, 1024, dictionary_fp)) {
121 : // input file is in UTF-8 encoding
122 : // don't convert last new line character to Ostr.
123 371221 : OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
124 371221 : const sal_Unicode *u = Ostr.getStr();
125 :
126 371221 : const sal_Int32 len = Ostr.getLength();
127 :
128 371221 : sal_Int32 i=0;
129 371221 : Ostr.iterateCodePoints(&i, 1);
130 371221 : if (len == i)
131 4046 : continue; // skip one character word
132 :
133 367175 : if (u[0] != current) {
134 : OSL_ENSURE( (u[0] > current), "Dictionary file should be sorted");
135 10872 : current = u[0];
136 10872 : charArray[current] = lenArray.size();
137 : }
138 :
139 367175 : lenArray.push_back(lenArrayCurr);
140 :
141 367175 : set_exists(u[0]);
142 : // first character is stored in charArray, so start from second
143 1193526 : for (i = 1; i < len; i++, lenArrayCurr++) {
144 826351 : set_exists(u[i]);
145 : #ifndef DICT_JA_ZH_IN_DATAFILE
146 826351 : fprintf(source_fp, "0x%04x, ", u[i]);
147 826351 : if ((lenArrayCurr & 0x0f) == 0x0f)
148 51646 : fputs("\n\t", source_fp);
149 : #else
150 : fwrite(&u[i], sizeof(u[i]), 1, source_fp);
151 : #endif
152 : }
153 367175 : }
154 2 : lenArray.push_back( lenArrayCurr ); // store last ending pointer
155 2 : charArray[current+1] = lenArray.size();
156 : #ifndef DICT_JA_ZH_IN_DATAFILE
157 2 : fputs("\n};\n", source_fp);
158 : #endif
159 2 : }
160 :
161 2 : static inline void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
162 : {
163 : #ifndef DICT_JA_ZH_IN_DATAFILE
164 2 : fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
165 2 : fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
166 : #else
167 : lenArrayOffset = ftell(source_fp);
168 : sal_uInt32 zero(0);
169 : fwrite(&zero, sizeof(zero), 1, source_fp);
170 : #endif
171 367179 : for (size_t k = 0; k < lenArray.size(); k++)
172 : {
173 367177 : if( !(k & 0xf) )
174 22950 : fputs("\n\t", source_fp);
175 :
176 : #ifndef DICT_JA_ZH_IN_DATAFILE
177 367177 : fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k]));
178 : #else
179 : fwrite(&lenArray[k], sizeof(lenArray[k]), 1, source_fp);
180 : #endif
181 : }
182 :
183 : #ifndef DICT_JA_ZH_IN_DATAFILE
184 2 : fputs("\n};\n", source_fp );
185 : #endif
186 2 : }
187 :
188 : /* FIXME?: what happens if in every range i there is at least one charArray != 0
189 : => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
190 : => then in index2, the last range will be ignored incorrectly */
191 2 : static inline void printIndex1(FILE *source_fp, sal_Int16 *set)
192 : {
193 : #ifndef DICT_JA_ZH_IN_DATAFILE
194 2 : fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
195 : #else
196 : index1Offset = ftell(source_fp);
197 : #endif
198 :
199 2 : sal_Int16 count = 0;
200 514 : for (sal_Int32 i = 0; i < 0x100; i++) {
201 512 : sal_Int32 j = 0;
202 90521 : while( j < 0x100 && charArray[(i<<8) + j] == 0)
203 89497 : j++;
204 :
205 512 : set[i] = (j < 0x100 ? count++ : 0xff);
206 : #ifndef DICT_JA_ZH_IN_DATAFILE
207 512 : fprintf(source_fp, "0x%02x, ", set[i]);
208 512 : if ((i & 0x0f) == 0x0f)
209 32 : fputs ("\n\t", source_fp);
210 : #else
211 : fwrite(&set[i], sizeof(set[i]), 1, source_fp);
212 : #endif
213 : }
214 :
215 : #ifndef DICT_JA_ZH_IN_DATAFILE
216 2 : fputs("};\n", source_fp);
217 : #endif
218 2 : }
219 :
220 2 : static inline void printIndex2(FILE *source_fp, sal_Int16 *set)
221 : {
222 : #ifndef DICT_JA_ZH_IN_DATAFILE
223 2 : fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
224 : #else
225 : index2Offset = ftell(source_fp);
226 : #endif
227 2 : sal_Int32 prev = 0;
228 514 : for (sal_Int32 i = 0; i < 0x100; i++) {
229 512 : if (set[i] != 0xff) {
230 42919 : for (sal_Int32 j = 0; j < 0x100; j++) {
231 42752 : sal_Int32 k = (i<<8) + j;
232 42752 : if (prev != 0 )
233 121030 : while( k < 0x10000 && charArray[k] == 0 )
234 99286 : k++;
235 :
236 42752 : prev = charArray[(i<<8) + j];
237 : #ifndef DICT_JA_ZH_IN_DATAFILE
238 42752 : fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
239 42752 : if ((j & 0x0f) == 0x0f)
240 2672 : fputs ("\n\t", source_fp);
241 : #else
242 : sal_uInt32 n = (k < 0x10000 ? charArray[k] + 1 : 0);
243 : fwrite(&n, sizeof(n), 1, source_fp);
244 : #endif
245 : }
246 : #ifndef DICT_JA_ZH_IN_DATAFILE
247 167 : fputs ("\n\t", source_fp);
248 : #endif
249 : }
250 : }
251 : #ifndef DICT_JA_ZH_IN_DATAFILE
252 2 : fputs ("\n};\n", source_fp);
253 : #endif
254 2 : }
255 :
256 : /* Generates a bitmask for the existence of sal_Unicode values in dictionary;
257 : it packs 8 sal_Bool values in 1 sal_uInt8 */
258 2 : static inline void printExistsMask(FILE *source_fp)
259 : {
260 : #ifndef DICT_JA_ZH_IN_DATAFILE
261 2 : fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
262 : #else
263 : existMarkOffset = ftell(source_fp);
264 : #endif
265 16386 : for (unsigned int i = 0; i < 0x2000; i++)
266 : {
267 : #ifndef DICT_JA_ZH_IN_DATAFILE
268 16384 : fprintf(source_fp, "0x%02x, ", exists[i]);
269 16384 : if ( (i & 0xf) == 0xf )
270 1024 : fputs("\n\t", source_fp);
271 : #else
272 : fwrite(&exists[i], sizeof(exists[i]), 1, source_fp);
273 : #endif
274 : }
275 :
276 : #ifndef DICT_JA_ZH_IN_DATAFILE
277 2 : fputs("\n};\n", source_fp);
278 : #endif
279 2 : }
280 :
281 4 : SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
282 : {
283 : FILE *dictionary_fp, *source_fp;
284 :
285 2 : if (argc == 1 || argc > 4)
286 : {
287 0 : fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
288 0 : exit(-1);
289 : }
290 :
291 2 : dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
292 2 : if (dictionary_fp == NULL)
293 : {
294 0 : fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
295 0 : exit(1);
296 : }
297 :
298 2 : if(argc == 2)
299 0 : source_fp = stdout;
300 : else
301 : {
302 : // create the C source file to write
303 2 : source_fp = fopen(argv[2], "wb");
304 2 : if (source_fp == NULL) {
305 0 : fclose(dictionary_fp);
306 0 : fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
307 0 : exit(1);
308 : }
309 : }
310 :
311 2 : vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea
312 : sal_Int16 set[0x100];
313 :
314 2 : printIncludes(source_fp);
315 : #ifndef DICT_JA_ZH_IN_DATAFILE
316 2 : fputs("extern \"C\" {\n", source_fp);
317 : #endif
318 2 : printDataArea(dictionary_fp, source_fp, lenArray);
319 2 : printLenArray(source_fp, lenArray);
320 2 : printIndex1(source_fp, set);
321 2 : printIndex2(source_fp, set);
322 2 : printExistsMask(source_fp);
323 2 : printFunctions(source_fp, argv[3]);
324 : #ifndef DICT_JA_ZH_IN_DATAFILE
325 2 : fputs("}\n", source_fp);
326 : #else
327 : // Put pointers to the tables at the end of the file...
328 : fwrite(&dataAreaOffset, sizeof(dataAreaOffset), 1, source_fp);
329 : fwrite(&lenArrayOffset, sizeof(lenArrayOffset), 1, source_fp);
330 : fwrite(&index1Offset, sizeof(index1Offset), 1, source_fp);
331 : fwrite(&index2Offset, sizeof(index2Offset), 1, source_fp);
332 : fwrite(&existMarkOffset, sizeof(existMarkOffset), 1, source_fp);
333 : #endif
334 :
335 2 : fclose(dictionary_fp);
336 2 : fclose(source_fp);
337 :
338 2 : return 0;
339 : }
340 :
341 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|