Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : */
9 :
10 : #include <helpcompiler/HelpIndexer.hxx>
11 :
12 : #include <rtl/string.hxx>
13 : #include <rtl/uri.hxx>
14 : #include <rtl/ustrbuf.hxx>
15 : #include <osl/file.hxx>
16 : #include <osl/thread.h>
17 : #include <boost/scoped_ptr.hpp>
18 : #include <algorithm>
19 :
20 : #include "LuceneHelper.hxx"
21 :
22 : using namespace lucene::document;
23 :
24 0 : HelpIndexer::HelpIndexer(OUString const &lang, OUString const &module,
25 : OUString const &srcDir, OUString const &outDir)
26 0 : : d_lang(lang), d_module(module)
27 : {
28 0 : d_indexDir = OUStringBuffer(outDir).append('/').
29 0 : append(module).append(".idxl").makeStringAndClear();
30 0 : d_captionDir = srcDir + "/caption";
31 0 : d_contentDir = srcDir + "/content";
32 0 : }
33 :
34 0 : bool HelpIndexer::indexDocuments()
35 : {
36 0 : if (!scanForFiles())
37 0 : return false;
38 :
39 : try
40 : {
41 0 : OUString sLang = d_lang.getToken(0, '-');
42 0 : bool bUseCJK = sLang == "ja" || sLang == "ko" || sLang == "zh";
43 :
44 : // Construct the analyzer appropriate for the given language
45 0 : boost::scoped_ptr<lucene::analysis::Analyzer> analyzer;
46 0 : if (bUseCJK)
47 0 : analyzer.reset(new lucene::analysis::LanguageBasedAnalyzer(L"cjk"));
48 : else
49 0 : analyzer.reset(new lucene::analysis::standard::StandardAnalyzer());
50 :
51 0 : OUString ustrSystemPath;
52 0 : osl::File::getSystemPathFromFileURL(d_indexDir, ustrSystemPath);
53 :
54 0 : OString indexDirStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
55 0 : lucene::index::IndexWriter writer(indexDirStr.getStr(), analyzer.get(), true);
56 : //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
57 : //exception for ja help. Could alternative ignore the exception and get
58 : //truncated results as per java-Lucene apparently
59 0 : writer.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH*2);
60 :
61 : // Index the identified help files
62 0 : Document doc;
63 0 : for (std::set<OUString>::iterator i = d_files.begin(); i != d_files.end(); ++i) {
64 0 : helpDocument(*i, &doc);
65 0 : writer.addDocument(&doc);
66 0 : doc.clear();
67 : }
68 0 : writer.optimize();
69 :
70 : // Optimize the index
71 0 : writer.optimize();
72 : }
73 0 : catch (CLuceneError &e)
74 : {
75 0 : d_error = OUString::createFromAscii(e.what());
76 0 : return false;
77 : }
78 :
79 0 : return true;
80 : }
81 :
82 0 : OUString const & HelpIndexer::getErrorMessage() {
83 0 : return d_error;
84 : }
85 :
86 0 : bool HelpIndexer::scanForFiles() {
87 0 : if (!scanForFiles(d_contentDir)) {
88 0 : return false;
89 : }
90 0 : if (!scanForFiles(d_captionDir)) {
91 0 : return false;
92 : }
93 0 : return true;
94 : }
95 :
96 0 : bool HelpIndexer::scanForFiles(OUString const & path) {
97 :
98 0 : osl::Directory dir(path);
99 0 : if (osl::FileBase::E_None != dir.open()) {
100 0 : d_error = "Error reading directory " + path;
101 0 : return true;
102 : }
103 :
104 0 : osl::DirectoryItem item;
105 0 : osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
106 0 : while (dir.getNextItem(item) == osl::FileBase::E_None) {
107 0 : item.getFileStatus(fileStatus);
108 0 : if (fileStatus.getFileType() == osl::FileStatus::Regular) {
109 0 : d_files.insert(fileStatus.getFileName());
110 : }
111 : }
112 :
113 0 : return true;
114 : }
115 :
116 0 : bool HelpIndexer::helpDocument(OUString const & fileName, Document *doc) {
117 : // Add the help path as an indexed, untokenized field.
118 :
119 0 : OUString path = "#HLP#" + d_module + "/" + fileName;
120 0 : std::vector<TCHAR> aPath(OUStringToTCHARVec(path));
121 0 : doc->add(*_CLNEW Field(_T("path"), &aPath[0], Field::STORE_YES | Field::INDEX_UNTOKENIZED));
122 :
123 : OUString sEscapedFileName =
124 : rtl::Uri::encode(fileName,
125 0 : rtl_UriCharClassUric, rtl_UriEncodeIgnoreEscapes, RTL_TEXTENCODING_UTF8);
126 :
127 : // Add the caption as a field.
128 0 : OUString captionPath = d_captionDir + "/" + sEscapedFileName;
129 0 : doc->add(*_CLNEW Field(_T("caption"), helpFileReader(captionPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
130 :
131 : // Add the content as a field.
132 0 : OUString contentPath = d_contentDir + "/" + sEscapedFileName;
133 0 : doc->add(*_CLNEW Field(_T("content"), helpFileReader(contentPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
134 :
135 0 : return true;
136 : }
137 :
138 0 : lucene::util::Reader *HelpIndexer::helpFileReader(OUString const & path) {
139 0 : osl::File file(path);
140 0 : if (osl::FileBase::E_None == file.open(osl_File_OpenFlag_Read)) {
141 0 : file.close();
142 0 : OUString ustrSystemPath;
143 0 : osl::File::getSystemPathFromFileURL(path, ustrSystemPath);
144 0 : OString pathStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
145 0 : return _CLNEW lucene::util::FileReader(pathStr.getStr(), "UTF-8");
146 : } else {
147 0 : return _CLNEW lucene::util::StringReader(L"");
148 0 : }
149 : }
150 :
151 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|