Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * Version: MPL 1.1 / GPLv3+ / LGPLv3+
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License or as specified alternatively below. You may obtain a copy of
8 : * the License at http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * Major Contributor(s):
16 : * Copyright (C) 2012 Gert van Valkenhoef <g.h.m.van.valkenhoef@rug.nl>
17 : * (initial developer)
18 : *
19 : * All Rights Reserved.
20 : *
21 : * For minor contributions see the git repository.
22 : *
23 : * Alternatively, the contents of this file may be used under the terms of
24 : * either the GNU General Public License Version 3 or later (the "GPLv3+"), or
25 : * the GNU Lesser General Public License Version 3 or later (the "LGPLv3+"),
26 : * in which case the provisions of the GPLv3+ or the LGPLv3+ are applicable
27 : * instead of those above.
28 : */
29 :
30 : #include <helpcompiler/HelpIndexer.hxx>
31 :
32 : #include <rtl/string.hxx>
33 : #include <rtl/uri.hxx>
34 : #include <rtl/ustrbuf.hxx>
35 : #include <osl/file.hxx>
36 : #include <osl/thread.h>
37 : #include <boost/scoped_ptr.hpp>
38 : #include <algorithm>
39 :
40 : #include "LuceneHelper.hxx"
41 :
42 : using namespace lucene::document;
43 :
44 9 : HelpIndexer::HelpIndexer(rtl::OUString const &lang, rtl::OUString const &module,
45 : rtl::OUString const &srcDir, rtl::OUString const &outDir)
46 9 : : d_lang(lang), d_module(module)
47 : {
48 18 : d_indexDir = rtl::OUStringBuffer(outDir).append('/').
49 9 : append(module).appendAscii(RTL_CONSTASCII_STRINGPARAM(".idxl")).toString();
50 9 : d_captionDir = srcDir + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/caption"));
51 9 : d_contentDir = srcDir + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/content"));
52 9 : }
53 :
54 9 : bool HelpIndexer::indexDocuments()
55 : {
56 9 : if (!scanForFiles())
57 0 : return false;
58 :
59 : try
60 : {
61 9 : rtl::OUString sLang = d_lang.getToken(0, '-');
62 9 : bool bUseCJK = sLang == "ja" || sLang == "ko" || sLang == "zh";
63 :
64 : // Construct the analyzer appropriate for the given language
65 9 : boost::scoped_ptr<lucene::analysis::Analyzer> analyzer;
66 9 : if (bUseCJK)
67 0 : analyzer.reset(new lucene::analysis::LanguageBasedAnalyzer(L"cjk"));
68 : else
69 9 : analyzer.reset(new lucene::analysis::standard::StandardAnalyzer());
70 :
71 9 : rtl::OUString ustrSystemPath;
72 9 : osl::File::getSystemPathFromFileURL(d_indexDir, ustrSystemPath);
73 :
74 9 : rtl::OString indexDirStr = rtl::OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
75 9 : lucene::index::IndexWriter writer(indexDirStr.getStr(), analyzer.get(), true);
76 : //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
77 : //exception for ja help. Could alternative ignore the exception and get
78 : //truncated results as per java-Lucene apparently
79 9 : writer.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH*2);
80 :
81 : // Index the identified help files
82 9 : Document doc;
83 8915 : for (std::set<rtl::OUString>::iterator i = d_files.begin(); i != d_files.end(); ++i) {
84 8906 : helpDocument(*i, &doc);
85 8906 : writer.addDocument(&doc);
86 8906 : doc.clear();
87 : }
88 9 : writer.optimize();
89 :
90 : // Optimize the index
91 9 : writer.optimize();
92 : }
93 0 : catch (CLuceneError &e)
94 : {
95 0 : d_error = rtl::OUString::createFromAscii(e.what());
96 0 : return false;
97 : }
98 :
99 9 : return true;
100 : }
101 :
102 0 : rtl::OUString const & HelpIndexer::getErrorMessage() {
103 0 : return d_error;
104 : }
105 :
106 9 : bool HelpIndexer::scanForFiles() {
107 9 : if (!scanForFiles(d_contentDir)) {
108 0 : return false;
109 : }
110 9 : if (!scanForFiles(d_captionDir)) {
111 0 : return false;
112 : }
113 9 : return true;
114 : }
115 :
116 18 : bool HelpIndexer::scanForFiles(rtl::OUString const & path) {
117 :
118 18 : osl::Directory dir(path);
119 18 : if (osl::FileBase::E_None != dir.open()) {
120 2 : d_error = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("Error reading directory ")) + path;
121 2 : return true;
122 : }
123 :
124 16 : osl::DirectoryItem item;
125 16 : osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
126 17317 : while (dir.getNextItem(item) == osl::FileBase::E_None) {
127 17285 : item.getFileStatus(fileStatus);
128 17285 : if (fileStatus.getFileType() == osl::FileStatus::Regular) {
129 17285 : d_files.insert(fileStatus.getFileName());
130 : }
131 : }
132 :
133 16 : return true;
134 : }
135 :
136 8906 : bool HelpIndexer::helpDocument(rtl::OUString const & fileName, Document *doc) {
137 : // Add the help path as an indexed, untokenized field.
138 :
139 : rtl::OUString path = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("#HLP#")) +
140 8906 : d_module + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/")) + fileName;
141 8906 : std::vector<TCHAR> aPath(OUStringToTCHARVec(path));
142 8906 : doc->add(*_CLNEW Field(_T("path"), &aPath[0], Field::STORE_YES | Field::INDEX_UNTOKENIZED));
143 :
144 : rtl::OUString sEscapedFileName =
145 : rtl::Uri::encode(fileName,
146 8906 : rtl_UriCharClassUric, rtl_UriEncodeIgnoreEscapes, RTL_TEXTENCODING_UTF8);
147 :
148 : // Add the caption as a field.
149 8906 : rtl::OUString captionPath = d_captionDir + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/")) + sEscapedFileName;
150 8906 : doc->add(*_CLNEW Field(_T("caption"), helpFileReader(captionPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
151 :
152 : // Add the content as a field.
153 8906 : rtl::OUString contentPath = d_contentDir + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/")) + sEscapedFileName;
154 8906 : doc->add(*_CLNEW Field(_T("content"), helpFileReader(contentPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
155 :
156 8906 : return true;
157 : }
158 :
159 17812 : lucene::util::Reader *HelpIndexer::helpFileReader(rtl::OUString const & path) {
160 17812 : osl::File file(path);
161 17812 : if (osl::FileBase::E_None == file.open(osl_File_OpenFlag_Read)) {
162 17285 : file.close();
163 17285 : rtl::OUString ustrSystemPath;
164 17285 : osl::File::getSystemPathFromFileURL(path, ustrSystemPath);
165 17285 : rtl::OString pathStr = rtl::OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
166 17285 : return _CLNEW lucene::util::FileReader(pathStr.getStr(), "UTF-8");
167 : } else {
168 527 : return _CLNEW lucene::util::StringReader(L"");
169 17812 : }
170 : }
171 :
172 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|