Commit 01028864, authored by Caolán McNamara

Resolves: fdo#45271, i#17964 count CJK words the way that's expected by users

üst 53600134
......@@ -43,6 +43,7 @@ struct SW_DLLPUBLIC SwDocStat
/// all paragraphs, including empty/hidden ones
sal_uLong nAllPara;
sal_uLong nWord;
sal_uLong nAsianWord;
sal_uLong nChar;
sal_uLong nCharExcludingSpaces;
sal_Bool bModified;
......
......@@ -173,6 +173,8 @@ class SW_DLLPUBLIC SwTxtNode: public SwCntntNode, public ::sfx2::Metadatable
//
SW_DLLPRIVATE void SetParaNumberOfWords( sal_uLong nTmpWords ) const;
SW_DLLPRIVATE sal_uLong GetParaNumberOfWords() const;
SW_DLLPRIVATE void SetParaNumberOfAsianWords( sal_uLong nTmpAsianWords ) const;
SW_DLLPRIVATE sal_uLong GetParaNumberOfAsianWords() const;
SW_DLLPRIVATE void SetParaNumberOfChars( sal_uLong nTmpChars ) const;
SW_DLLPRIVATE sal_uLong GetParaNumberOfChars() const;
SW_DLLPRIVATE void SetParaNumberOfCharsExcludingSpaces( sal_uLong nTmpChars ) const;
......
......@@ -113,7 +113,7 @@ void SwDocTest::testPageDescName()
CPPUNIT_ASSERT_MESSAGE("GetPageDescName results must be unique", aResults.size() == 3);
}
//See https://bugs.freedesktop.org/show_bug.cgi?id=32463 for motivation
//See https://bugs.freedesktop.org/show_bug.cgi?id=32463
void SwDocTest::testFileNameFields()
{
//Here's a file name with some chars in it that will be %% encoded, when expanding
......@@ -225,8 +225,8 @@ void SwDocTest::testSwScanner()
CPPUNIT_ASSERT_MESSAGE("Has Text Node", pTxtNode);
//See https://bugs.freedesktop.org/show_bug.cgi?id=40449 for motivation
//See https://bugs.freedesktop.org/show_bug.cgi?id=39365 for motivation
//See https://bugs.freedesktop.org/show_bug.cgi?id=40449
//See https://bugs.freedesktop.org/show_bug.cgi?id=39365
//Use a temporary rtl::OUString as the arg, as that's the trouble behind
//fdo#40449 and fdo#39365
{
......@@ -248,7 +248,7 @@ void SwDocTest::testSwScanner()
rWorld.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("World")));
}
//See https://www.libreoffice.org/bugzilla/show_bug.cgi?id=45271 for motivation
//See https://www.libreoffice.org/bugzilla/show_bug.cgi?id=45271
{
const sal_Unicode IDEOGRAPHICFULLSTOP_D[] = { 0x3002, 'D' };
......@@ -261,13 +261,51 @@ void SwDocTest::testSwScanner()
m_pDoc->InsertPoolItem(aPaM, aWestLangItem, 0 );
SwDocStat aDocStat;
pTxtNode = aPaM.GetNode()->GetTxtNode();
pTxtNode->CountWords(aDocStat, 0, SAL_N_ELEMENTS(IDEOGRAPHICFULLSTOP_D));
CPPUNIT_ASSERT_MESSAGE("Should be 2", aDocStat.nChar == 2);
CPPUNIT_ASSERT_MESSAGE("Should be 2", aDocStat.nCharExcludingSpaces == 2);
}
{
const sal_Unicode test[] =
{
0x3053, 0x306E, 0x65E5, 0x672C, 0x8A9E, 0x306F, 0x6B63, 0x3057,
0x304F, 0x6570, 0x3048, 0x3089, 0x308C, 0x308B, 0x3067, 0x3057,
0x3087, 0x3046, 0x304B, 0x3002, 0x0041, 0x006E, 0x0064, 0x0020,
0x006C, 0x0065, 0x0074, 0x0027, 0x0073, 0x0020, 0x0074, 0x0068,
0x0072, 0x006F, 0x0077, 0x0020, 0x0073, 0x006F, 0x006D, 0x0065,
0x0020, 0x0045, 0x006E, 0x0067, 0x006C, 0x0069, 0x0073, 0x0068,
0x0020, 0x0069, 0x006E, 0x0020, 0x0074, 0x006F, 0x0020, 0x006D,
0x0061, 0x006B, 0x0065, 0x0020, 0x0069, 0x0074, 0x0020, 0x0069,
0x006E, 0x0074, 0x0065, 0x0072, 0x0065, 0x0073, 0x0074, 0x0069,
0x006E, 0x0067, 0x002E, 0x0020, 0x0020, 0x305D, 0x3057, 0x3066,
0x3001, 0x307E, 0x305F, 0x65E5, 0x672C, 0x8A9E, 0x3000, 0x3000,
0x3067, 0x3082, 0x4ECA, 0x56DE, 0x306F, 0x7A7A, 0x767D, 0x3092,
0x3000, 0x3000, 0x5165, 0x308C, 0x307E, 0x3057, 0x305F, 0x3002,
0x0020, 0x0020, 0x0053, 0x006F, 0x0020, 0x0068, 0x006F, 0x0077,
0x0020, 0x0064, 0x006F, 0x0065, 0x0073, 0x0020, 0x0074, 0x0068,
0x0069, 0x0073, 0x0020, 0x0064, 0x006F, 0x003F, 0x0020, 0x0020
};
m_pDoc->AppendTxtNode(*aPaM.GetPoint());
m_pDoc->InsertString(aPaM, rtl::OUString(test,
SAL_N_ELEMENTS(test)));
SvxLanguageItem aCJKLangItem( LANGUAGE_JAPANESE, RES_CHRATR_CJK_LANGUAGE );
SvxLanguageItem aWestLangItem( LANGUAGE_ENGLISH_US, RES_CHRATR_LANGUAGE );
m_pDoc->InsertPoolItem(aPaM, aCJKLangItem, 0 );
m_pDoc->InsertPoolItem(aPaM, aWestLangItem, 0 );
SwDocStat aDocStat;
pTxtNode = aPaM.GetNode()->GetTxtNode();
pTxtNode->CountWords(aDocStat, 0, SAL_N_ELEMENTS(test));
CPPUNIT_ASSERT_MESSAGE("58 words", aDocStat.nWord == 58);
CPPUNIT_ASSERT_MESSAGE("43 Asian characters and Korean syllables", aDocStat.nAsianWord == 43);
CPPUNIT_ASSERT_MESSAGE("105 non-whitespace chars", aDocStat.nCharExcludingSpaces == 105);
CPPUNIT_ASSERT_MESSAGE("128 characters", aDocStat.nChar == 128);
}
//See https://issues.apache.org/ooo/show_bug.cgi?id=89042 for motivation
//See https://issues.apache.org/ooo/show_bug.cgi?id=89042
{
SwDocStat aDocStat;
......@@ -298,8 +336,7 @@ void SwDocTest::testSwScanner()
}
}
//See https://bugs.freedesktop.org/show_bug.cgi?id=40599 for motivation
//See https://bugs.freedesktop.org/show_bug.cgi?id=40599
void SwDocTest::testGraphicAnchorDeletion()
{
CPPUNIT_ASSERT_MESSAGE("Expected initial 0 count", m_pDoc->GetDocStat().nChar == 0);
......
......@@ -43,6 +43,7 @@ SwDocStat::SwDocStat() :
nPara(1),
nAllPara(1),
nWord(0),
nAsianWord(0),
nChar(0),
nCharExcludingSpaces(0),
bModified(sal_True)
......@@ -61,6 +62,7 @@ void SwDocStat::Reset()
nPara = 1;
nAllPara= 1;
nWord = 0;
nAsianWord = 0;
nChar = 0;
nCharExcludingSpaces = 0;
bModified = sal_True;
......
......@@ -680,6 +680,44 @@ SwScanner::SwScanner( const SwTxtNode& rNd, const rtl::OUString& rTxt,
}
}
namespace
{
// fdo#45271: for word counting, every Asian code point is treated as a word
// of its own instead of relying on the word boundaries found so far.
//
// rText   text that contains the candidate word
// nBegin  UTF-16 index at which the candidate word starts
// nLen    length of the candidate word in UTF-16 units
//
// Returns the (possibly shortened) word length:
//  - nLen unchanged when nLen <= 1, or when no Asian code point is met,
//  - the size of the first code point when that code point is Asian,
//  - otherwise the distance from nBegin to the first Asian code point.
sal_Int32 forceEachAsianCodePointToWord(const rtl::OUString &rText, sal_Int32 nBegin, sal_Int32 nLen)
{
    // Nothing to split in an empty or single-unit word.
    if (nLen <= 1)
        return nLen;

    const uno::Reference< XBreakIterator > &rxBreak = pBreakIt->GetBreakIter();

    // Classify the first code point, then step over it.
    const bool bStartsAsian =
        rxBreak->getScriptType( rText, nBegin ) == i18n::ScriptType::ASIAN;
    sal_Int32 nPos = nBegin;
    rText.iterateCodePoints(&nPos, 1);

    // A leading Asian code point is a complete word on its own.
    if (bStartsAsian)
        return nPos - nBegin;

    // Leading code point is not Asian: the word stops right before the
    // first Asian code point that shows up inside the original range.
    const sal_Int32 nEnd = nBegin + nLen;
    for ( ; nPos < nEnd; rText.iterateCodePoints(&nPos, 1))
    {
        if (rxBreak->getScriptType( rText, nPos ) == i18n::ScriptType::ASIAN)
            return nPos - nBegin;
    }

    return nLen;
}
}
sal_Bool SwScanner::NextWord()
{
nBegin = nBegin + nLen;
......@@ -802,6 +840,9 @@ sal_Bool SwScanner::NextWord()
if( ! nLen )
return sal_False;
if ( nWordType == i18n::WordType::WORD_COUNT )
nLen = forceEachAsianCodePointToWord(aText, nBegin, nLen);
aWord = aText.copy( nBegin, nLen );
return sal_True;
......@@ -1812,6 +1853,7 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
{
// accumulate into DocStat record to return the values
rStat.nWord += GetParaNumberOfWords();
rStat.nAsianWord += GetParaNumberOfAsianWords();
rStat.nChar += GetParaNumberOfChars();
rStat.nCharExcludingSpaces += GetParaNumberOfCharsExcludingSpaces();
return;
......@@ -1842,7 +1884,8 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
// all counts exclude hidden paras and hidden+redlined within para
// definition of space/white chars in SwScanner (and BreakIter!)
// uses both lcl_IsSkippableWhiteSpace and BreakIter getWordBoundary in SwScanner
sal_uInt32 nTmpWords = 0; // count of all contiguous blocks of non-white chars
sal_uInt32 nTmpWords = 0; // count of all words
sal_uInt32 nTmpAsianWords = 0; //count of all Asian codepoints
sal_uInt32 nTmpChars = 0; // count of all chars
sal_uInt32 nTmpCharsExcludingSpaces = 0; // all non-white chars
......@@ -1862,7 +1905,10 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() ))
{
++nTmpWords;
nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord());
const rtl::OUString &rWord = aScanner.GetWord();
if (pBreakIt->GetBreakIter()->getScriptType(rWord, 0) == i18n::ScriptType::ASIAN)
++nTmpAsianWords;
nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(rWord);
}
}
}
......@@ -1890,7 +1936,10 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
while ( aScanner.NextWord() )
{
++nTmpWords;
nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord());
const rtl::OUString &rWord = aScanner.GetWord();
if (pBreakIt->GetBreakIter()->getScriptType(rWord, 0) == i18n::ScriptType::ASIAN)
++nTmpAsianWords;
nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(rWord);
}
nTmpChars = pBreakIt->getGraphemeCount(aNumString);
......@@ -1909,12 +1958,14 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
if ( isCountAll )
{
SetParaNumberOfWords( nTmpWords );
SetParaNumberOfAsianWords( nTmpAsianWords );
SetParaNumberOfChars( nTmpChars );
SetParaNumberOfCharsExcludingSpaces( nTmpCharsExcludingSpaces );
SetWordCountDirty( false );
}
// accumulate into DocStat record to return the values
rStat.nWord += nTmpWords;
rStat.nAsianWord += nTmpAsianWords;
rStat.nChar += nTmpChars;
rStat.nCharExcludingSpaces += nTmpCharsExcludingSpaces;
}
......@@ -1928,6 +1979,7 @@ struct SwParaIdleData_Impl
SwGrammarMarkUp* pGrammarCheck; // for grammar checking / proof reading
SwWrongList* pSmartTags;
sal_uLong nNumberOfWords;
sal_uLong nNumberOfAsianWords;
sal_uLong nNumberOfChars;
sal_uLong nNumberOfCharsExcludingSpaces;
bool bWordCountDirty;
......@@ -1941,6 +1993,7 @@ struct SwParaIdleData_Impl
pGrammarCheck ( 0 ),
pSmartTags ( 0 ),
nNumberOfWords ( 0 ),
nNumberOfAsianWords ( 0 ),
nNumberOfChars ( 0 ),
nNumberOfCharsExcludingSpaces ( 0 ),
bWordCountDirty ( true ),
......@@ -2033,10 +2086,25 @@ void SwTxtNode::SetParaNumberOfWords( sal_uLong nNew ) const
m_pParaIdleData_Impl->nNumberOfWords = nNew;
}
}
// Returns the word count stored in the paragraph idle data,
// or 0 when no idle data object exists for this node.
sal_uLong SwTxtNode::GetParaNumberOfWords() const
{
    if ( !m_pParaIdleData_Impl )
        return 0;

    return m_pParaIdleData_Impl->nNumberOfWords;
}
// Stores nNew as the Asian word count in the paragraph idle data.
// Does nothing when no idle data object exists for this node.
void SwTxtNode::SetParaNumberOfAsianWords( sal_uLong nNew ) const
{
    if ( !m_pParaIdleData_Impl )
        return;

    m_pParaIdleData_Impl->nNumberOfAsianWords = nNew;
}
// Returns the Asian word count stored in the paragraph idle data,
// or 0 when no idle data object exists for this node.
sal_uLong SwTxtNode::GetParaNumberOfAsianWords() const
{
    if ( !m_pParaIdleData_Impl )
        return 0;

    return m_pParaIdleData_Impl->nNumberOfAsianWords;
}
void SwTxtNode::SetParaNumberOfChars( sal_uLong nNew ) const
{
if ( m_pParaIdleData_Impl )
......@@ -2044,10 +2112,12 @@ void SwTxtNode::SetParaNumberOfChars( sal_uLong nNew ) const
m_pParaIdleData_Impl->nNumberOfChars = nNew;
}
}
// Returns the character count stored in the paragraph idle data,
// or 0 when no idle data object exists for this node.
sal_uLong SwTxtNode::GetParaNumberOfChars() const
{
    if ( !m_pParaIdleData_Impl )
        return 0;

    return m_pParaIdleData_Impl->nNumberOfChars;
}
void SwTxtNode::SetWordCountDirty( bool bNew ) const
{
if ( m_pParaIdleData_Impl )
......
......@@ -42,6 +42,8 @@
#include <swwait.hxx>
#include <wrtsh.hxx>
//TODO: add Asian/non-Asian word counts to the UI when CJK mode is enabled.
SwWordCountDialog::SwWordCountDialog(Window* pParent) :
#if defined _MSC_VER
#pragma warning (disable : 4355)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment