tdf#96197 do not break Korean words in the middle.

Korean words are composed of Hangul and are separated by a space or a newline. This patch improves the line-breaking function in the CJK break iterator so that it does not break Korean words in the middle. It now breaks at the first character of the last Korean word. Change-Id: I91b20733c0c5ec4755bf68eb0d7c14c42c1f3556 Reviewed-on: https://gerrit.libreoffice.org/42987 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Eike Rathke <erack@redhat.com>

tdf#96197 do not break Korean words in the middle.
Korean words are composed of Hangul and are separated by a space or a newline. This patch improves the line-breaking function in the CJK break iterator so that it does not break Korean words in the middle. It now breaks at the first character of the last Korean word. Change-Id: I91b20733c0c5ec4755bf68eb0d7c14c42c1f3556 Reviewed-on: https://gerrit.libreoffice.org/42987 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Eike Rathke <erack@redhat.com>
441fded7 · Mark Hung · Eike Rathke · 4791e606 · 441fded7 · 441fded7
Kaydet (Commit) 441fded7 authored Eki 01, 2017 tarafından Mark Hung Kaydeden (comit) Eike Rathke Eki 04, 2017
Hide whitespace changes
Inline Side-by-side

Showing with 35 additions and 0 deletions

test_breakiterator.cxx i18npool/qa/cppunit/test_breakiterator.cxx +16 -0

breakiterator_cjk.cxx i18npool/source/breakiterator/breakiterator_cjk.cxx +19 -0

No files found.
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -158,6 +158,22 @@ void TestBreakIterator::testLineBreaking()
            (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
        }
    }
+
+    //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
+    {
+        const sal_Unicode HANGUL[] = { 0xc560, 0xad6D, 0xac00, 0xc758, 0x0020, 0xac00,
+                                       0xc0ac, 0xb294};
+        OUString aTest(HANGUL, SAL_N_ELEMENTS(HANGUL));
+
+        aLocale.Language = "ko";
+        aLocale.Country = "KR";
+
+        {
+            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
+                aHyphOptions, aUserOptions);
+            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
+        }
+    }
 }

 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629

--- a/i18npool/source/breakiterator/breakiterator_cjk.cxx
+++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx
@@ -86,6 +86,8 @@ BreakIterator_CJK::getWordBoundary( const OUString& text, sal_Int32 anyPos,
    return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection);
 }

+#define isHangul(cCh) ((cCh>=0xAC00&&cCh<=0xD7AF)||(cCh>=0x1100&&cCh<=0x11FF))
+
 LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
        const OUString& Text, sal_Int32 nStartPos,
        const css::lang::Locale& /*rLocale*/, sal_Int32 /*nMinBreakPos*/,
@@ -94,17 +96,34 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
 {
    LineBreakResults lbr;

+    const sal_Int32 nOldStartPos = nStartPos;
+
    if (bOptions.allowPunctuationOutsideMargin &&
            hangingCharacters.indexOf(Text[nStartPos]) != -1 &&
            (Text.iterateCodePoints( &nStartPos ), nStartPos == Text.getLength())) {
        ; // do nothing
    } else if (bOptions.applyForbiddenRules && 0 < nStartPos && nStartPos < Text.getLength()) {
+
        while (nStartPos > 0 &&
                (bOptions.forbiddenBeginCharacters.indexOf(Text[nStartPos]) != -1 ||
                 bOptions.forbiddenEndCharacters.indexOf(Text[nStartPos-1]) != -1))
            Text.iterateCodePoints( &nStartPos, -1);
    }

+    // Prevent cutting Korean words in the middle.
+    if ( nOldStartPos == nStartPos && isHangul( Text[nStartPos] ) )
+    {
+        while ( nStartPos >= 0 && isHangul( Text[nStartPos] ) )
+            --nStartPos;
+
+        // beginning of the last Korean word.
+        if ( nStartPos < nOldStartPos )
+            ++nStartPos;
+
+        if ( nStartPos == 0 )
+            nStartPos = nOldStartPos;
+    }
+
    lbr.breakIndex = nStartPos;
    lbr.breakType = BreakType::WORDBOUNDARY;
    return lbr;