HelpIndexer.cxx 4.78 KB
Newer Older
1 2
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

10
#include <helpcompiler/HelpIndexer.hxx>
Gert van Valkenhoef's avatar
Gert van Valkenhoef committed
11

12
#include <rtl/string.hxx>
13
#include <rtl/uri.hxx>
14
#include <o3tl/runtimetooustring.hxx>
15
#include <osl/file.hxx>
16
#include <osl/thread.h>
Caolán McNamara's avatar
Caolán McNamara committed
17
#include <memory>
Gert van Valkenhoef's avatar
Gert van Valkenhoef committed
18

19
#include "LuceneHelper.hxx"
20 21
#include <CLucene.h>
#include <CLucene/analysis/LanguageBasedAnalyzer.h>
22

Gert van Valkenhoef's avatar
Gert van Valkenhoef committed
23 24
using namespace lucene::document;

25 26
// Stores the language and module name and derives the three locations used
// by the indexer:
//   - caption files:  <srcDir>/caption
//   - content files:  <srcDir>/content
//   - index output:   <outDir>/<module>.idxl
HelpIndexer::HelpIndexer(OUString const &lang, OUString const &module,
    OUString const &srcDir, OUString const &outDir)
    : d_lang(lang)
    , d_module(module)
{
    // Input locations below the source directory.
    d_captionDir = srcDir + "/caption";
    d_contentDir = srcDir + "/content";

    // Output location: one ".idxl" entry per module below the output directory.
    d_indexDir = outDir + "/" + module + ".idxl";
}
Gert van Valkenhoef's avatar
Gert van Valkenhoef committed
33

34 35 36
bool HelpIndexer::indexDocuments()
{
    if (!scanForFiles())
37 38
        return false;

39 40
    try
    {
41
        OUString sLang = d_lang.getToken(0, '-');
42 43 44
        bool bUseCJK = sLang == "ja" || sLang == "ko" || sLang == "zh";

        // Construct the analyzer appropriate for the given language
Caolán McNamara's avatar
Caolán McNamara committed
45
        std::unique_ptr<lucene::analysis::Analyzer> analyzer;
46 47 48 49 50
        if (bUseCJK)
            analyzer.reset(new lucene::analysis::LanguageBasedAnalyzer(L"cjk"));
        else
            analyzer.reset(new lucene::analysis::standard::StandardAnalyzer());

51
        OUString ustrSystemPath;
52 53
        osl::File::getSystemPathFromFileURL(d_indexDir, ustrSystemPath);

54
        OString indexDirStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
55 56 57 58 59 60 61 62
        lucene::index::IndexWriter writer(indexDirStr.getStr(), analyzer.get(), true);
        //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
        //exception for ja help. Could alternative ignore the exception and get
        //truncated results as per java-Lucene apparently
        writer.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH*2);

        // Index the identified help files
        Document doc;
63 64 65
        for (auto const& elem : d_files)
        {
            helpDocument(elem, &doc);
66 67 68 69 70 71 72 73 74 75
            writer.addDocument(&doc);
            doc.clear();
        }
        writer.optimize();

        // Optimize the index
        writer.optimize();
    }
    catch (CLuceneError &e)
    {
76
        d_error = o3tl::runtimeToOUString(e.what());
77
        return false;
78 79 80
    }

    return true;
Gert van Valkenhoef's avatar
Gert van Valkenhoef committed
81 82 83 84
}


bool HelpIndexer::scanForFiles() {
85 86 87 88 89 90 91
    if (!scanForFiles(d_contentDir)) {
        return false;
    }
    if (!scanForFiles(d_captionDir)) {
        return false;
    }
    return true;
Gert van Valkenhoef's avatar
Gert van Valkenhoef committed
92 93
}

94
// Adds the names of all regular files found directly in the directory `path`
// to d_files.
//
// Returns false and sets d_error if the directory cannot be opened; returns
// true otherwise. Entries whose file status cannot be queried are skipped.
bool HelpIndexer::scanForFiles(OUString const & path) {

    osl::Directory dir(path);
    if (osl::FileBase::E_None != dir.open()) {
        d_error = "Error reading directory " + path;
        return false;
    }

    osl::DirectoryItem item;
    // Only the file name and the file type are requested.
    osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
    while (dir.getNextItem(item) == osl::FileBase::E_None) {
        // fileStatus is reused across iterations: if the query fails, do not
        // evaluate it, since it may still describe the previous entry.
        if (item.getFileStatus(fileStatus) != osl::FileBase::E_None) {
            continue;
        }
        if (fileStatus.getFileType() == osl::FileStatus::Regular) {
            d_files.insert(fileStatus.getFileName());
        }
    }

    return true;
}

114
// Fills `doc` with the three fields describing one help file:
//   "path"    - "#HLP#<module>/<fileName>", stored and indexed untokenized
//   "caption" - reader on <captionDir>/<escaped fileName>, tokenized, not stored
//   "content" - reader on <contentDir>/<escaped fileName>, tokenized, not stored
void HelpIndexer::helpDocument(OUString const & fileName, Document *doc) const {
    // The help path uses the file name as given (not URI-escaped).
    OUString helpPath = "#HLP#" + d_module + "/" + fileName;
    std::vector<TCHAR> helpPathChars(OUStringToTCHARVec(helpPath));
    doc->add(*_CLNEW Field(_T("path"), helpPathChars.data(),
                           Field::STORE_YES | Field::INDEX_UNTOKENIZED));

    // The caption and content locations are URLs, so the file name is
    // URI-encoded (UTF-8, existing escapes left untouched) before it is appended.
    OUString escapedName = rtl::Uri::encode(
        fileName, rtl_UriCharClassUric, rtl_UriEncodeIgnoreEscapes, RTL_TEXTENCODING_UTF8);

    // Caption text: tokenized for searching, not stored in the index.
    OUString captionUrl = d_captionDir + "/" + escapedName;
    doc->add(*_CLNEW Field(_T("caption"), helpFileReader(captionUrl),
                           Field::STORE_NO | Field::INDEX_TOKENIZED));

    // Content text: same treatment as the caption.
    OUString contentUrl = d_contentDir + "/" + escapedName;
    doc->add(*_CLNEW Field(_T("content"), helpFileReader(contentUrl),
                           Field::STORE_NO | Field::INDEX_TOKENIZED));
}

134
// Returns a newly allocated reader for the file at URL `path`.
// If the file cannot be opened for reading, a reader over an empty string is
// returned instead. Otherwise the result is a FileReader on the corresponding
// system path, constructed with "UTF-8" as its encoding argument.
lucene::util::Reader *HelpIndexer::helpFileReader(OUString const & path) {
    // Probe whether the file can be opened; the handle itself is not needed.
    osl::File probe(path);
    if (osl::FileBase::E_None != probe.open(osl_File_OpenFlag_Read)) {
        return _CLNEW lucene::util::StringReader(L"");
    }
    probe.close();

    // FileReader takes a system path in the thread's text encoding.
    OUString systemPath;
    osl::File::getSystemPathFromFileURL(path, systemPath);
    OString encodedPath = OUStringToOString(systemPath, osl_getThreadTextEncoding());
    return _CLNEW lucene::util::FileReader(encodedPath.getStr(), "UTF-8");
}
146 147

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */