|           Line data    Source code 
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * Version: MPL 1.1 / GPLv3+ / LGPLv3+
       4             :  *
       5             :  * The contents of this file are subject to the Mozilla Public License Version
       6             :  * 1.1 (the "License"); you may not use this file except in compliance with
       7             :  * the License or as specified alternatively below. You may obtain a copy of
       8             :  * the License at http://www.mozilla.org/MPL/
       9             :  *
      10             :  * Software distributed under the License is distributed on an "AS IS" basis,
      11             :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12             :  * for the specific language governing rights and limitations under the
      13             :  * License.
      14             :  *
      15             :  * Major Contributor(s):
      16             :  * Copyright (C) 2012 Gert van Valkenhoef <g.h.m.van.valkenhoef@rug.nl>
      17             :  *  (initial developer)
      18             :  *
      19             :  * All Rights Reserved.
      20             :  *
      21             :  * For minor contributions see the git repository.
      22             :  *
      23             :  * Alternatively, the contents of this file may be used under the terms of
      24             :  * either the GNU General Public License Version 3 or later (the "GPLv3+"), or
      25             :  * the GNU Lesser General Public License Version 3 or later (the "LGPLv3+"),
      26             :  * in which case the provisions of the GPLv3+ or the LGPLv3+ are applicable
      27             :  * instead of those above.
      28             :  */
      29             : 
      30             : #include <helpcompiler/HelpIndexer.hxx>
      31             : 
      32             : #include <rtl/string.hxx>
      33             : #include <rtl/uri.hxx>
      34             : #include <rtl/ustrbuf.hxx>
      35             : #include <osl/file.hxx>
      36             : #include <osl/thread.h>
      37             : #include <boost/scoped_ptr.hpp>
      38             : #include <algorithm>
      39             : 
      40             : #include "LuceneHelper.hxx"
      41             : 
      42             : using namespace lucene::document;
      43             : 
      44           9 : HelpIndexer::HelpIndexer(rtl::OUString const &lang, rtl::OUString const &module,
      45             :     rtl::OUString const &srcDir, rtl::OUString const &outDir)
      46           9 :     : d_lang(lang), d_module(module)
      47             : {
      48          18 :     d_indexDir = rtl::OUStringBuffer(outDir).append('/').
      49           9 :         append(module).appendAscii(RTL_CONSTASCII_STRINGPARAM(".idxl")).toString();
      50           9 :     d_captionDir = srcDir + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/caption"));
      51           9 :     d_contentDir = srcDir + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/content"));
      52           9 : }
      53             : 
      54           9 : bool HelpIndexer::indexDocuments()
      55             : {
      56           9 :     if (!scanForFiles())
      57           0 :         return false;
      58             : 
      59             :     try
      60             :     {
      61           9 :         rtl::OUString sLang = d_lang.getToken(0, '-');
      62           9 :         bool bUseCJK = sLang == "ja" || sLang == "ko" || sLang == "zh";
      63             : 
      64             :         // Construct the analyzer appropriate for the given language
      65           9 :         boost::scoped_ptr<lucene::analysis::Analyzer> analyzer;
      66           9 :         if (bUseCJK)
      67           0 :             analyzer.reset(new lucene::analysis::LanguageBasedAnalyzer(L"cjk"));
      68             :         else
      69           9 :             analyzer.reset(new lucene::analysis::standard::StandardAnalyzer());
      70             : 
      71           9 :         rtl::OUString ustrSystemPath;
      72           9 :         osl::File::getSystemPathFromFileURL(d_indexDir, ustrSystemPath);
      73             : 
      74           9 :         rtl::OString indexDirStr = rtl::OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
      75           9 :         lucene::index::IndexWriter writer(indexDirStr.getStr(), analyzer.get(), true);
      76             :         //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
      77             :         //exception for ja help. Could alternative ignore the exception and get
      78             :         //truncated results as per java-Lucene apparently
      79           9 :         writer.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH*2);
      80             : 
      81             :         // Index the identified help files
      82           9 :         Document doc;
      83        8915 :         for (std::set<rtl::OUString>::iterator i = d_files.begin(); i != d_files.end(); ++i) {
      84        8906 :             helpDocument(*i, &doc);
      85        8906 :             writer.addDocument(&doc);
      86        8906 :             doc.clear();
      87             :         }
      88           9 :         writer.optimize();
      89             : 
      90             :         // Optimize the index
      91           9 :         writer.optimize();
      92             :     }
      93           0 :     catch (CLuceneError &e)
      94             :     {
      95           0 :         d_error = rtl::OUString::createFromAscii(e.what());
      96           0 :         return false;
      97             :     }
      98             : 
      99           9 :     return true;
     100             : }
     101             : 
     102           0 : rtl::OUString const & HelpIndexer::getErrorMessage() {
     103           0 :     return d_error;
     104             : }
     105             : 
     106           9 : bool HelpIndexer::scanForFiles() {
     107           9 :     if (!scanForFiles(d_contentDir)) {
     108           0 :         return false;
     109             :     }
     110           9 :     if (!scanForFiles(d_captionDir)) {
     111           0 :         return false;
     112             :     }
     113           9 :     return true;
     114             : }
     115             : 
     116          18 : bool HelpIndexer::scanForFiles(rtl::OUString const & path) {
     117             : 
     118          18 :     osl::Directory dir(path);
     119          18 :     if (osl::FileBase::E_None != dir.open()) {
     120           2 :         d_error = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("Error reading directory ")) + path;
     121           2 :         return true;
     122             :     }
     123             : 
     124          16 :     osl::DirectoryItem item;
     125          16 :     osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
     126       17317 :     while (dir.getNextItem(item) == osl::FileBase::E_None) {
     127       17285 :         item.getFileStatus(fileStatus);
     128       17285 :         if (fileStatus.getFileType() == osl::FileStatus::Regular) {
     129       17285 :             d_files.insert(fileStatus.getFileName());
     130             :         }
     131             :     }
     132             : 
     133          16 :     return true;
     134             : }
     135             : 
     136        8906 : bool HelpIndexer::helpDocument(rtl::OUString const & fileName, Document *doc) {
     137             :     // Add the help path as an indexed, untokenized field.
     138             : 
     139             :     rtl::OUString path = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("#HLP#")) +
     140        8906 :         d_module + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/")) + fileName;
     141        8906 :     std::vector<TCHAR> aPath(OUStringToTCHARVec(path));
     142        8906 :     doc->add(*_CLNEW Field(_T("path"), &aPath[0], Field::STORE_YES | Field::INDEX_UNTOKENIZED));
     143             : 
     144             :     rtl::OUString sEscapedFileName =
     145             :         rtl::Uri::encode(fileName,
     146        8906 :         rtl_UriCharClassUric, rtl_UriEncodeIgnoreEscapes, RTL_TEXTENCODING_UTF8);
     147             : 
     148             :     // Add the caption as a field.
     149        8906 :     rtl::OUString captionPath = d_captionDir + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/")) + sEscapedFileName;
     150        8906 :     doc->add(*_CLNEW Field(_T("caption"), helpFileReader(captionPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
     151             : 
     152             :     // Add the content as a field.
     153        8906 :     rtl::OUString contentPath = d_contentDir + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("/")) + sEscapedFileName;
     154        8906 :     doc->add(*_CLNEW Field(_T("content"), helpFileReader(contentPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
     155             : 
     156        8906 :     return true;
     157             : }
     158             : 
     159       17812 : lucene::util::Reader *HelpIndexer::helpFileReader(rtl::OUString const & path) {
     160       17812 :     osl::File file(path);
     161       17812 :     if (osl::FileBase::E_None == file.open(osl_File_OpenFlag_Read)) {
     162       17285 :         file.close();
     163       17285 :         rtl::OUString ustrSystemPath;
     164       17285 :         osl::File::getSystemPathFromFileURL(path, ustrSystemPath);
     165       17285 :         rtl::OString pathStr = rtl::OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
     166       17285 :         return _CLNEW lucene::util::FileReader(pathStr.getStr(), "UTF-8");
     167             :     } else {
     168         527 :         return _CLNEW lucene::util::StringReader(L"");
     169       17812 :     }
     170             : }
     171             : 
     172             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
 |