Kaydet (Commit) f0372076 authored tarafından László Németh Kaydeden (comit) Andras Timar

fix spell checking issues using recent Hunspell patches

Test: English words "Ian" and "item" are not allowed as "İan", "İtem" now.

Patch list with commit ids in Hunspell repository:

commit 66badb7449c2053c89456f11a7f71f3f5916b550
Extend dotless i and dotted I rules to Crimean Tatar language

commit 88cf975c295e3ec808efb77bb1a2a031d77f0c89
Allow dotted I in dictionary, and disable bad capitalization

commit 39b785a6b03b35cc8a27f43f6005dcaa432694e1
FORBIDDENWORD precedes BREAK

commit 0f691abe68788d0a58e72ab66877a9f670cd2741
Remove forbidden words from dash suggestion list

commit 15b2cde4f01706f0a648518a5cfc57394d015448
tdf#95024 fix compound handling for new Hungarian orthography

commit de3ae6844af62300e473f7b7b66a56e54153b4b9
fix compound word part "pa:"

Change-Id: Id12b5629b0c975464072b5b144743cbe40fe45a3
Reviewed-on: https://gerrit.libreoffice.org/44200
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Andras Timar <andras.timar@collabora.com>
üst a7cd63df
From 88cf975c295e3ec808efb77bb1a2a031d77f0c89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
<laszlo.nemeth@collabora.com>
Date: Thu, 5 Oct 2017 12:24:02 +0200
Subject: [PATCH] Allow dotted I in dictionary, and disable bad capitalization
of i.
Dictionary words weren't recognized with dotted I, but dictionary
words with the letter i were recognized with dotted I, too.
---
src/hunspell/hunspell.cxx | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
index 1ef11df..5c98f8a 100644
--- a/src/hunspell/hunspell.cxx
+++ b/src/hunspell/hunspell.cxx
@@ -562,11 +562,15 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
}
}
case INITCAP: {
-
+ // handle special capitalization of dotted I
+ bool Idot = (utf8 && (unsigned char) scw[0] == 0xc4 && (unsigned char) scw[1] == 0xb0);
*info += SPELL_ORIGCAP;
- mkallsmall2(scw, sunicw);
- std::string u8buffer(scw);
- mkinitcap2(scw, sunicw);
+ if (captype == ALLCAP) {
+ mkallsmall2(scw, sunicw);
+ mkinitcap2(scw, sunicw);
+ if (Idot)
+ scw.replace(0, 1, "\xc4\xb0");
+ }
if (captype == INITCAP)
*info += SPELL_INITCAP;
rv = checkword(scw, info, root);
@@ -581,9 +585,13 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
}
if (rv && is_keepcase(rv) && (captype == ALLCAP))
rv = NULL;
- if (rv)
+ if (rv || (Idot && langnum != LANG_az && langnum != LANG_tr && langnum != LANG_crh))
break;
+ mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ mkinitcap2(scw, sunicw);
+
rv = checkword(u8buffer, info, root);
if (abbv && !rv) {
u8buffer.push_back('.');
--
1.9.1
From 66badb7449c2053c89456f11a7f71f3f5916b550 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
<laszlo.nemeth@collabora.com>
Date: Thu, 5 Oct 2017 11:13:28 +0200
Subject: [PATCH] Extend dotless i and dotted I rules to Crimean Tatar language
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
to support its special casing of ı/I, i/İ.
(Use
LANG crh
in the affix file to use this feature.)
---
src/hunspell/csutil.cxx | 5 +++--
src/hunspell/langnum.hxx | 1 +
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/hunspell/csutil.cxx b/src/hunspell/csutil.cxx
index df97b57..2980da7 100644
--- a/src/hunspell/csutil.cxx
+++ b/src/hunspell/csutil.cxx
@@ -2401,6 +2401,7 @@ static struct lang_map lang2enc[] =
{{"ar", LANG_ar}, {"az", LANG_az},
{"az_AZ", LANG_az}, // for back-compatibility
{"bg", LANG_bg}, {"ca", LANG_ca},
+ {"crh", LANG_crh},
{"cs", LANG_cs}, {"da", LANG_da},
{"de", LANG_de}, {"el", LANG_el},
{"en", LANG_en}, {"es", LANG_es},
@@ -2458,7 +2459,7 @@ unsigned short unicodetoupper(unsigned short c, int langnum) {
// In Azeri and Turkish, I and i dictinct letters:
// There are a dotless lower case i pair of upper `I',
// and an upper I with dot pair of lower `i'.
- if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr)))
+ if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
return 0x0130;
#ifdef OPENOFFICEORG
return static_cast<unsigned short>(u_toupper(c));
@@ -2475,7 +2476,7 @@ unsigned short unicodetolower(unsigned short c, int langnum) {
// In Azeri and Turkish, I and i dictinct letters:
// There are a dotless lower case i pair of upper `I',
// and an upper I with dot pair of lower `i'.
- if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr)))
+ if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
return 0x0131;
#ifdef OPENOFFICEORG
return static_cast<unsigned short>(u_tolower(c));
diff --git a/src/hunspell/langnum.hxx b/src/hunspell/langnum.hxx
index a64d3d7..f09de40 100644
--- a/src/hunspell/langnum.hxx
+++ b/src/hunspell/langnum.hxx
@@ -48,6 +48,7 @@ enum {
LANG_az = 100, // custom number
LANG_bg = 41,
LANG_ca = 37,
+ LANG_crh = 102, // custom number
LANG_cs = 42,
LANG_da = 45,
LANG_de = 49,
--
1.9.1
From 39b785a6b03b35cc8a27f43f6005dcaa432694e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
Date: Mon, 9 Oct 2017 13:02:39 +0200
Subject: [PATCH] FORBIDDENWORD precedes BREAK
Now it's possible to forbid compound forms recognized by
BREAK word breaking.
---
src/hunspell/hunspell.cxx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
index 5c98f8a..3fd0d16 100644
--- a/src/hunspell/hunspell.cxx
+++ b/src/hunspell/hunspell.cxx
@@ -633,7 +633,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
}
// recursive breaking at break points
- if (!wordbreak.empty()) {
+ if (!wordbreak.empty() && !(*info & SPELL_FORBIDDEN)) {
int nbr = 0;
wl = scw.size();
--
1.9.1
From 0f691abe68788d0a58e72ab66877a9f670cd2741 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
Date: Tue, 10 Oct 2017 11:58:43 +0200
Subject: [PATCH] Remove forbidden words from dash suggestion list
---
src/hunspell/hunspell.cxx | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
index 3fd0d16..76e61b1 100644
--- a/src/hunspell/hunspell.cxx
+++ b/src/hunspell/hunspell.cxx
@@ -1069,7 +1069,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
wspace.append("-");
wspace.append(scw.substr(dash_pos + 1));
}
- insert_sug(slst, wspace);
+ int info = 0;
+ if (pAMgr && pAMgr->get_forbiddenword())
+ checkword(wspace, &info, NULL);
+ if (!(info & SPELL_FORBIDDEN))
+ insert_sug(slst, wspace);
}
nodashsug = 0;
}
--
1.9.1
From 15b2cde4f01706f0a648518a5cfc57394d015448 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
Date: Thu, 12 Oct 2017 16:47:57 +0200
Subject: [PATCH] fix compound handling for new Hungarian orthography
Extend partial fix in commit 42807f970ac2d65f0d13a7c57eb454b210e92240.
---
src/hunspell/affixmgr.cxx | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
index ffce7bb..ea0f0fc 100644
--- a/src/hunspell/affixmgr.cxx
+++ b/src/hunspell/affixmgr.cxx
@@ -1990,6 +1990,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
std::string tmp(sfxappnd);
reverseword(tmp);
numsyllable -= get_syllable(tmp) + sfxextra;
+ } else {
+ numsyllable -= sfxextra;
}
// + 1 word, if syllable number of the prefix > 1 (hungarian
@@ -2024,7 +2026,6 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
(TESTAFF(rv->astr, compoundroot, rv->alen))) {
wordnum++;
}
-
// second word is acceptable, as a word with prefix or/and suffix?
// hungarian conventions: compounding is acceptable,
// when compound forms consist 2 word, otherwise
@@ -2553,6 +2554,8 @@ int AffixMgr::compound_check_morph(const char* word,
std::string tmp(sfxappnd);
reverseword(tmp);
numsyllable -= get_syllable(tmp) + sfxextra;
+ } else {
+ numsyllable -= sfxextra;
}
// + 1 word, if syllable number of the prefix > 1 (hungarian
--
1.9.1
From de3ae6844af62300e473f7b7b66a56e54153b4b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
Date: Mon, 16 Oct 2017 23:00:23 +0200
Subject: [PATCH] fix compound word part "pa:"
(regression in morphological analysis)
---
src/hunspell/affixmgr.cxx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
index ea0f0fc..52c7fa5 100644
--- a/src/hunspell/affixmgr.cxx
+++ b/src/hunspell/affixmgr.cxx
@@ -2608,7 +2608,7 @@ int AffixMgr::compound_check_morph(const char* word,
if (!m.empty()) {
result.push_back(MSEP_FLD);
result.append(MORPH_PART);
- result.append(word + 1);
+ result.append(word + i);
line_uniq_app(m, MSEP_REC);
result.append(m);
}
--
1.9.1
......@@ -21,6 +21,12 @@ $(eval $(call gb_UnpackedTarball_set_patchlevel,hunspell,1))
$(eval $(call gb_UnpackedTarball_add_patches,hunspell, \
external/hunspell/0001-Revert-Remove-autotools-autogenerated-files.patch \
external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch \
external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch \
external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch \
external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch \
external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch \
external/hunspell/0001-fix-compound-word-part-pa.patch \
))
# vim: set noet sw=4 ts=4:
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment