Commit 917bcea4, authored by Caolán McNamara

bump hunspell to 1.6.2

Change-Id: I91d4d58f2b8ba69067de1d08476a8cebbb780535
Reviewed-on: https://gerrit.libreoffice.org/42555
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Caolán McNamara <caolanm@redhat.com>
Tested-by: Caolán McNamara <caolanm@redhat.com>
üst 3c543f87
......@@ -87,8 +87,8 @@ export HARFBUZZ_SHA256SUM := ccec4930ff0bb2d0c40aee203075447954b64a8c2695202413c
export HARFBUZZ_TARBALL := harfbuzz-1.4.8.tar.bz2
export HSQLDB_SHA256SUM := d30b13f4ba2e3b6a2d4f020c0dee0a9fb9fc6fbcc2d561f36b78da4bf3802370
export HSQLDB_TARBALL := 17410483b5b5f267aa18b7e00b65e6e0-hsqldb_1_8_0.zip
export HUNSPELL_SHA256SUM := 512e7d2ee69dad0b35ca011076405e56e0f10963a02d4859dbcc4faf53ca68e2
export HUNSPELL_TARBALL := 047c3feb121261b76dc16cdb62f54483-hunspell-1.6.0.tar.gz
export HUNSPELL_SHA256SUM := 3cd9ceb062fe5814f668e4f22b2fa6e3ba0b339b921739541ce180cac4d6f4c4
export HUNSPELL_TARBALL := hunspell-1.6.2.tar.gz
export HYPHEN_SHA256SUM := 304636d4eccd81a14b6914d07b84c79ebb815288c76fe027b9ebff6ff24d5705
export HYPHEN_TARBALL := 5ade6ae2a99bc1e9e57031ca88d36dad-hyphen-2.8.8.tar.gz
export ICU_SHA256SUM := 7132fdaf9379429d004005217f10e00b7d2319d0fea22bdfddef8991c45b75fe
......
From 9a0baf202f67291eaf482f1bcf654e21d71943e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
Date: Mon, 23 Jan 2017 11:43:53 +0000
Subject: [PATCH] cppcheck: redundant c_str
---
src/hunspell/suggestmgr.cxx | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
index b998341..8d46dd6 100644
--- a/src/hunspell/suggestmgr.cxx
+++ b/src/hunspell/suggestmgr.cxx
@@ -1107,7 +1107,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
int sc2;
if (utf8) {
w_f.clear();
- u8_u16(w_f, f.c_str());
+ u8_u16(w_f, f);
sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +
leftcommonsubstring(w_word, w_f);
} else {
@@ -1132,7 +1132,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
std::string target2 = phonet(candidate, *ph);
w_target2.clear();
if (utf8) {
- u8_u16(w_target2, target2.c_str());
+ u8_u16(w_target2, target2);
scphon = 2 * ngram(3, w_target, w_target2,
NGRAM_LONGER_WORSE);
} else {
--
2.9.3
From 93156ba9a8e644f8b0b724880668714adcb0d094 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
Date: Mon, 23 Jan 2017 12:05:07 +0000
Subject: [PATCH] cppcheck: rv is reassigned before old value used
---
src/hunspell/affixmgr.cxx | 6 ++----
src/hunspell/suggestmgr.cxx | 3 +--
2 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
index 680cbe9..21cf384 100644
--- a/src/hunspell/affixmgr.cxx
+++ b/src/hunspell/affixmgr.cxx
@@ -1494,9 +1494,8 @@ int AffixMgr::defcpd_check(hentry*** words,
}
inline int AffixMgr::candidate_check(const char* word, int len) {
- struct hentry* rv = NULL;
- rv = lookup(word);
+ struct hentry* rv = lookup(word);
if (rv)
return 1;
@@ -3045,10 +3044,9 @@ struct hentry* AffixMgr::affix_check(const char* word,
int len,
const FLAG needflag,
char in_compound) {
- struct hentry* rv = NULL;
// check all prefixes (also crossed with suffixes if allowed)
- rv = prefix_check(word, len, in_compound, needflag);
+ struct hentry* rv = prefix_check(word, len, in_compound, needflag);
if (rv)
return rv;
diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
index 8d46dd6..54a474f 100644
--- a/src/hunspell/suggestmgr.cxx
+++ b/src/hunspell/suggestmgr.cxx
@@ -1675,11 +1675,10 @@ std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) {
if (HENTRY_DATA(rv))
p = (char*)strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);
while (p) {
- struct hentry* rv2 = NULL;
p += MORPH_TAG_LEN;
int plen = fieldlen(p);
std::string allomorph(p, plen);
- rv2 = pAMgr->lookup(allomorph.c_str());
+ struct hentry* rv2 = pAMgr->lookup(allomorph.c_str());
while (rv2) {
// if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <=
// sfxcount) {
--
2.9.3
From f366e97fa8d7ad21060033b733dda15299edf7c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
Date: Fri, 10 Feb 2017 15:37:11 +0000
Subject: [PATCH 1/4] loop via iterators
---
src/hunspell/csutil.cxx | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/hunspell/csutil.cxx b/src/hunspell/csutil.cxx
index c1666a5..2408677 100644
--- a/src/hunspell/csutil.cxx
+++ b/src/hunspell/csutil.cxx
@@ -2537,13 +2537,17 @@ int get_captype_utf8(const std::vector<w_char>& word, int langnum) {
size_t ncap = 0;
size_t nneutral = 0;
size_t firstcap = 0;
- for (size_t i = 0; i < word.size(); ++i) {
- unsigned short idx = (word[i].h << 8) + word[i].l;
+
+ std::vector<w_char>::const_iterator it = word.begin();
+ std::vector<w_char>::const_iterator it_end = word.end();
+ while (it != it_end) {
+ unsigned short idx = (it->h << 8) + it->l;
unsigned short lwridx = unicodetolower(idx, langnum);
if (idx != lwridx)
ncap++;
if (unicodetoupper(idx, langnum) == lwridx)
nneutral++;
+ ++it;
}
if (ncap) {
unsigned short idx = (word[0].h << 8) + word[0].l;
--
2.9.3
From bf05e232805f6c1fae5dea3c223de8bdaab425e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
Date: Mon, 23 Jan 2017 13:26:53 +0000
Subject: [PATCH 1/3] unroll this a bit
---
src/hunspell/csutil.cxx | 49 ++++++++++++++++++++++++++++---------------------
1 file changed, 28 insertions(+), 21 deletions(-)
diff --git a/src/hunspell/csutil.cxx b/src/hunspell/csutil.cxx
index ac5cd98..c1666a5 100644
--- a/src/hunspell/csutil.cxx
+++ b/src/hunspell/csutil.cxx
@@ -518,18 +518,20 @@ unsigned char ccase(const struct cs_info* csconv, int nIndex) {
w_char upper_utf(w_char u, int langnum) {
unsigned short idx = (u.h << 8) + u.l;
- if (idx != unicodetoupper(idx, langnum)) {
- u.h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
- u.l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
+ unsigned short upridx = unicodetoupper(idx, langnum);
+ if (idx != upridx) {
+ u.h = (unsigned char)(upridx >> 8);
+ u.l = (unsigned char)(upridx & 0x00FF);
}
return u;
}
w_char lower_utf(w_char u, int langnum) {
unsigned short idx = (u.h << 8) + u.l;
- if (idx != unicodetolower(idx, langnum)) {
- u.h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
- u.l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF);
+ unsigned short lwridx = unicodetolower(idx, langnum);
+ if (idx != lwridx) {
+ u.h = (unsigned char)(lwridx >> 8);
+ u.l = (unsigned char)(lwridx & 0x00FF);
}
return u;
}
@@ -551,12 +553,13 @@ std::string& mkallsmall(std::string& s, const struct cs_info* csconv) {
}
std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u,
- int langnum) {
+ int langnum) {
for (size_t i = 0; i < u.size(); ++i) {
unsigned short idx = (u[i].h << 8) + u[i].l;
- if (idx != unicodetolower(idx, langnum)) {
- u[i].h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
- u[i].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF);
+ unsigned short lwridx = unicodetolower(idx, langnum);
+ if (idx != lwridx) {
+ u[i].h = (unsigned char)(lwridx >> 8);
+ u[i].l = (unsigned char)(lwridx & 0x00FF);
}
}
return u;
@@ -565,9 +568,10 @@ std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u,
std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int langnum) {
for (size_t i = 0; i < u.size(); i++) {
unsigned short idx = (u[i].h << 8) + u[i].l;
- if (idx != unicodetoupper(idx, langnum)) {
- u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
- u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
+ unsigned short upridx = unicodetoupper(idx, langnum);
+ if (idx != upridx) {
+ u[i].h = (unsigned char)(upridx >> 8);
+ u[i].l = (unsigned char)(upridx & 0x00FF);
}
}
return u;
@@ -583,9 +587,10 @@ std::string& mkinitcap(std::string& s, const struct cs_info* csconv) {
std::vector<w_char>& mkinitcap_utf(std::vector<w_char>& u, int langnum) {
if (!u.empty()) {
unsigned short idx = (u[0].h << 8) + u[0].l;
- if (idx != unicodetoupper(idx, langnum)) {
- u[0].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
- u[0].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
+ unsigned short upridx = unicodetoupper(idx, langnum);
+ if (idx != upridx) {
+ u[0].h = (unsigned char)(upridx >> 8);
+ u[0].l = (unsigned char)(upridx & 0x00FF);
}
}
return u;
@@ -601,9 +606,10 @@ std::string& mkinitsmall(std::string& s, const struct cs_info* csconv) {
std::vector<w_char>& mkinitsmall_utf(std::vector<w_char>& u, int langnum) {
if (!u.empty()) {
unsigned short idx = (u[0].h << 8) + u[0].l;
- if (idx != unicodetolower(idx, langnum)) {
- u[0].h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
- u[0].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF);
+ unsigned short lwridx = unicodetolower(idx, langnum);
+ if (idx != lwridx) {
+ u[0].h = (unsigned char)(lwridx >> 8);
+ u[0].l = (unsigned char)(lwridx & 0x00FF);
}
}
return u;
@@ -2533,9 +2539,10 @@ int get_captype_utf8(const std::vector<w_char>& word, int langnum) {
size_t firstcap = 0;
for (size_t i = 0; i < word.size(); ++i) {
unsigned short idx = (word[i].h << 8) + word[i].l;
- if (idx != unicodetolower(idx, langnum))
+ unsigned short lwridx = unicodetolower(idx, langnum);
+ if (idx != lwridx)
ncap++;
- if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum))
+ if (unicodetoupper(idx, langnum) == lwridx)
nneutral++;
}
if (ncap) {
--
2.9.3
From 8e957585671c76fa21e6265ec7b68aa19507f4fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
Date: Fri, 10 Feb 2017 15:49:17 +0000
Subject: [PATCH 2/4] add a get_clen_and_captype varient that takes a buffer
kcachegrind reports 1,057,506,901 -> 830,529,143 on
echo Hollo | valgrind --tool=callgrind ./src/tools/.libs/hunspell -d nl_NL
---
src/hunspell/hashmgr.cxx | 16 +++++++++++-----
src/hunspell/hashmgr.hxx | 1 +
2 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/src/hunspell/hashmgr.cxx b/src/hunspell/hashmgr.cxx
index 1de1690..4844b49 100644
--- a/src/hunspell/hashmgr.cxx
+++ b/src/hunspell/hashmgr.cxx
@@ -363,12 +363,11 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word,
}
// detect captype and modify word length for UTF-8 encoding
-int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
+int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) {
int len;
if (utf8) {
- std::vector<w_char> dest_utf;
- len = u8_u16(dest_utf, word);
- *captype = get_captype_utf8(dest_utf, langnum);
+ len = u8_u16(workbuf, word);
+ *captype = get_captype_utf8(workbuf, langnum);
} else {
len = word.size();
*captype = get_captype(word, csconv);
@@ -376,6 +375,11 @@ int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
return len;
}
+int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
+ std::vector<w_char> workbuf;
+ return get_clen_and_captype(word, captype, workbuf);
+}
+
// remove word (personal dictionary function for standalone applications)
int HashMgr::remove(const std::string& word) {
struct hentry* dp = lookup(word.c_str());
@@ -527,6 +531,8 @@ int HashMgr::load_tables(const char* tpath, const char* key) {
// loop through all words on much list and add to hash
// table and create word and affix strings
+ std::vector<w_char> workbuf;
+
while (dict->getline(ts)) {
mychomp(ts);
// split each line into word and morphological description
@@ -599,7 +605,7 @@ int HashMgr::load_tables(const char* tpath, const char* key) {
}
int captype;
- int wcl = get_clen_and_captype(ts, &captype);
+ int wcl = get_clen_and_captype(ts, &captype, workbuf);
const std::string *dp_str = dp.empty() ? NULL : &dp;
// add the word and its index plus its capitalized form optionally
if (add_word(ts, wcl, flags, al, dp_str, false) ||
diff --git a/src/hunspell/hashmgr.hxx b/src/hunspell/hashmgr.hxx
index 812171a..5a09c45 100644
--- a/src/hunspell/hashmgr.hxx
+++ b/src/hunspell/hashmgr.hxx
@@ -125,6 +125,7 @@ class HashMgr {
private:
int get_clen_and_captype(const std::string& word, int* captype);
+ int get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf);
int load_tables(const char* tpath, const char* key);
int add_word(const std::string& word,
int wcl,
--
2.9.3
From 1fada01663b29b57c010a9c274e45a5cf9ecf222 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
<laszlo.nemeth@collabora.com>
Date: Sun, 19 Mar 2017 13:19:29 +0100
Subject: [PATCH 2/7] fix other regression in compounding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Allow compound words again with
starting "kor", "alak", "asszony", "úr"
related to the "REP kor _kor" etc. rules
using the Hungarian spelling dictionary.
regression from...
commit 73b1cad1af7ab94252f75784fa6724cf062a6966
Author: Martin Hosken <martin_hosken@sil.org>
Date: Mon Apr 18 16:28:26 2016 +0700
Add support for bounded conversion
---
src/hunspell/affixmgr.cxx | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
index 78c70e7..ec2093d 100644
--- a/src/hunspell/affixmgr.cxx
+++ b/src/hunspell/affixmgr.cxx
@@ -1290,8 +1290,8 @@ int AffixMgr::cpdrep_check(const char* word, int wl) {
// search every occurence of the pattern in the word
while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) {
std::string candidate(word);
- size_t type = r == word ? 1 : 0;
- if (r - word + reptable[i].pattern.size() == lenp)
+ size_t type = r == word && langnum != LANG_hu ? 1 : 0;
+ if (r - word + reptable[i].pattern.size() == lenp && langnum != LANG_hu)
type += 2;
candidate.replace(r - word, lenp, reptable[i].outstrings[type]);
if (candidate_check(candidate.c_str(), candidate.size()))
--
2.7.4
From cf0967951a25a2daa10a636092193af5c5497aa2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
Date: Fri, 10 Feb 2017 16:36:27 +0000
Subject: [PATCH 3/4] hoist string lowering from ngram to ngsuggest
only lower when we have to and reuse scratch buffers as
tolower destination
kcachegrind reports 830,529,143 -> 779,887,690 on
echo Hollo | valgrind --tool=callgrind ./src/tools/.libs/hunspell -d nl_NL
---
src/hunspell/suggestmgr.cxx | 143 +++++++++++++++++++++++++++++---------------
1 file changed, 95 insertions(+), 48 deletions(-)
diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
index 54a474f..ea52707 100644
--- a/src/hunspell/suggestmgr.cxx
+++ b/src/hunspell/suggestmgr.cxx
@@ -1075,10 +1075,8 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
u8_u16(w_target, target);
}
- std::vector<w_char> w_entry;
std::string f;
std::vector<w_char> w_f;
- std::vector<w_char> w_target2;
for (size_t i = 0; i < rHMgr.size(); ++i) {
while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
@@ -1091,13 +1089,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
continue;
if (utf8) {
- w_entry.clear();
- u8_u16(w_entry, HENTRY_WORD(hp));
- sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +
- leftcommonsubstring(w_word, w_entry);
+ w_f.clear();
+ u8_u16(w_f, HENTRY_WORD(hp));
+
+ int leftcommon = leftcommonsubstring(w_word, w_f);
+ if (low) {
+ // lowering dictionary word
+ mkallsmall_utf(w_f, langnum);
+ }
+ sc = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
} else {
- sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
- leftcommonsubstring(word, HENTRY_WORD(hp));
+ f.assign(HENTRY_WORD(hp));
+
+ int leftcommon = leftcommonsubstring(word, f.c_str());
+ if (low) {
+ // lowering dictionary word
+ mkallsmall(f, csconv);
+ }
+ sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
}
// check special pronounciation
@@ -1108,11 +1117,20 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
if (utf8) {
w_f.clear();
u8_u16(w_f, f);
- sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +
- leftcommonsubstring(w_word, w_f);
+
+ int leftcommon = leftcommonsubstring(w_word, w_f);
+ if (low) {
+ // lowering dictionary word
+ mkallsmall_utf(w_f, langnum);
+ }
+ sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
} else {
- sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
- leftcommonsubstring(word, f.c_str());
+ int leftcommon = leftcommonsubstring(word, f.c_str());
+ if (low) {
+ // lowering dictionary word
+ mkallsmall(f, csconv);
+ }
+ sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
}
if (sc2 > sc)
sc = sc2;
@@ -1129,14 +1147,14 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
candidate = HENTRY_WORD(hp);
mkallcap(candidate, csconv);
}
- std::string target2 = phonet(candidate, *ph);
- w_target2.clear();
+ f = phonet(candidate, *ph);
+ w_f.clear();
if (utf8) {
- u8_u16(w_target2, target2);
- scphon = 2 * ngram(3, w_target, w_target2,
+ u8_u16(w_f, f);
+ scphon = 2 * ngram(3, w_target, w_f,
NGRAM_LONGER_WORSE);
} else {
- scphon = 2 * ngram(3, target, target2,
+ scphon = 2 * ngram(3, target, f,
NGRAM_LONGER_WORSE);
}
}
@@ -1177,12 +1195,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
w_mw[k].l = '*';
w_mw[k].h = 0;
}
- thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);
+
+ if (low) {
+ // lowering dictionary word
+ mkallsmall_utf(w_mw, langnum);
+ }
+
+ thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH);
} else {
std::string mw = word;
for (int k = sp; k < n; k += 4)
mw[k] = '*';
- thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
+
+ if (low) {
+ // lowering dictionary word
+ mkallsmall(mw, csconv);
+ }
+
+ thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH);
}
}
thresh = thresh / 3;
@@ -1210,7 +1240,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
return;
}
- std::vector<w_char> w_glst_word;
for (int i = 0; i < MAX_ROOTS; i++) {
if (roots[i]) {
struct hentry* rp = roots[i];
@@ -1225,15 +1254,26 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
for (int k = 0; k < nw; k++) {
if (utf8) {
- w_glst_word.clear();
- u8_u16(w_glst_word, glst[k].word);
- sc = ngram(n, w_word, w_glst_word,
- NGRAM_ANY_MISMATCH + low) +
- leftcommonsubstring(w_word, w_glst_word);
+ w_f.clear();
+ u8_u16(w_f, glst[k].word);
+
+ int leftcommon = leftcommonsubstring(w_word, w_f);
+ if (low) {
+ // lowering dictionary word
+ mkallsmall_utf(w_f, langnum);
+ }
+
+ sc = ngram(n, w_word, w_f, NGRAM_ANY_MISMATCH) + leftcommon;
} else {
- sc = ngram(n, word, glst[k].word,
- NGRAM_ANY_MISMATCH + low) +
- leftcommonsubstring(word, glst[k].word);
+ f = glst[k].word;
+
+ int leftcommon = leftcommonsubstring(word, f.c_str());
+ if (low) {
+ // lowering dictionary word
+ mkallsmall(f, csconv);
+ }
+
+ sc = ngram(n, word, f, NGRAM_ANY_MISMATCH) + leftcommon;
}
if (sc > thresh) {
@@ -1318,19 +1358,37 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
w_gl.clear();
if (utf8) {
u8_u16(w_gl, gl);
- re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
- ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
+ //w_gl is lowercase already at this point
+ re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
+ if (low) {
+ w_f = w_word;
+ // lowering dictionary word
+ mkallsmall_utf(w_f, langnum);
+ re += ngram(2, w_gl, w_f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
+ } else {
+ re += ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
+ }
} else {
- re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
- ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
+ //gl is lowercase already at this point
+ re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
+ if (low) {
+ f = word;
+ // lowering dictionary word
+ mkallsmall(f, csconv);
+ re += ngram(2, gl, f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
+ } else {
+ re += ngram(2, gl, word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
+ }
}
int ngram_score, leftcommon_score;
if (utf8) {
- ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);
+ //w_gl is lowercase already at this point
+ ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH);
leftcommon_score = leftcommonsubstring(w_word, w_gl);
} else {
- ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);
+ //gl is lowercase already at this point
+ ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH);
leftcommon_score = leftcommonsubstring(word, gl.c_str());
}
gscore[i] =
@@ -1802,14 +1860,6 @@ int SuggestMgr::ngram(int n,
l2 = su2.size();
if (l2 == 0)
return 0;
- // lowering dictionary word
- const std::vector<w_char>* p_su2 = &su2;
- std::vector<w_char> su2_copy;
- if (opt & NGRAM_LOWERING) {
- su2_copy = su2;
- mkallsmall_utf(su2_copy, langnum);
- p_su2 = &su2_copy;
- }
for (int j = 1; j <= n; j++) {
ns = 0;
for (int i = 0; i <= (l1 - j); i++) {
@@ -1817,7 +1867,7 @@ int SuggestMgr::ngram(int n,
for (int l = 0; l <= (l2 - j); l++) {
for (k = 0; k < j; k++) {
const w_char& c1 = su1[i + k];
- const w_char& c2 = (*p_su2)[l + k];
+ const w_char& c2 = su2[l + k];
if ((c1.l != c2.l) || (c1.h != c2.h))
break;
}
@@ -1862,14 +1912,11 @@ int SuggestMgr::ngram(int n,
if (l2 == 0)
return 0;
l1 = s1.size();
- std::string t(s2);
- if (opt & NGRAM_LOWERING)
- mkallsmall(t, csconv);
for (int j = 1; j <= n; j++) {
ns = 0;
for (int i = 0; i <= (l1 - j); i++) {
- //t is haystack, s1[i..i+j) is needle
- if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
+ //s2 is haystack, s1[i..i+j) is needle
+ if (s2.find(s1.c_str()+i, 0, j) != std::string::npos) {
ns++;
} else if (opt & NGRAM_WEIGHTED) {
ns--;
--
2.9.3
From aab258adbd9c78931a36b96e58975a08000249a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
Date: Fri, 10 Feb 2017 17:14:35 +0000
Subject: [PATCH 4/4] either clear will be called anyway before use, or its
unused afterwards
---
src/hunspell/suggestmgr.cxx | 8 --------
1 file changed, 8 deletions(-)
diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
index ea52707..ae34535 100644
--- a/src/hunspell/suggestmgr.cxx
+++ b/src/hunspell/suggestmgr.cxx
@@ -1089,7 +1089,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
continue;
if (utf8) {
- w_f.clear();
u8_u16(w_f, HENTRY_WORD(hp));
int leftcommon = leftcommonsubstring(w_word, w_f);
@@ -1115,7 +1114,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
int sc2;
if (utf8) {
- w_f.clear();
u8_u16(w_f, f);
int leftcommon = leftcommonsubstring(w_word, w_f);
@@ -1139,7 +1137,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
int scphon = -20000;
if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {
if (utf8) {
- w_candidate.clear();
u8_u16(w_candidate, HENTRY_WORD(hp));
mkallcap_utf(w_candidate, langnum);
u16_u8(candidate, w_candidate);
@@ -1148,7 +1145,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
mkallcap(candidate, csconv);
}
f = phonet(candidate, *ph);
- w_f.clear();
if (utf8) {
u8_u16(w_f, f);
scphon = 2 * ngram(3, w_target, w_f,
@@ -1254,7 +1250,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
for (int k = 0; k < nw; k++) {
if (utf8) {
- w_f.clear();
u8_u16(w_f, glst[k].word);
int leftcommon = leftcommonsubstring(w_word, w_f);
@@ -1335,7 +1330,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
std::string gl;
int len;
if (utf8) {
- w_gl.clear();
len = u8_u16(w_gl, guess[i]);
mkallsmall_utf(w_gl, langnum);
u16_u8(gl, w_gl);
@@ -1355,7 +1349,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
}
// using 2-gram instead of 3, and other weightening
- w_gl.clear();
if (utf8) {
u8_u16(w_gl, gl);
//w_gl is lowercase already at this point
@@ -1421,7 +1414,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
// lowering rootphon[i]
std::string gl;
int len;
- w_gl.clear();
if (utf8) {
len = u8_u16(w_gl, rootsphon[i]);
mkallsmall_utf(w_gl, langnum);
--
2.9.3
From f4ec6a283f972c82d068f4472320d424c40d45cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
<laszlo.nemeth@collabora.com>
Date: Thu, 23 Mar 2017 16:40:52 +0100
Subject: [PATCH 5/7] fix syllable counting in compound word handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Note: one of the fixed regressions is related to an old
hidden mistake: using clen instead of blen of the stem
word lengths was indifferent with the original get_syllable(),
because blen == clen at 8-bit encodings, and UTF-8
words were handled by null-termination. Implementing Unicode
support in Hunspell, clen was changed only in
compound_check_morph() to blen accidentally, but not
in compound_check(), resulting problems from the
recent std::string conversion.
Now this commit is a real fix for the regression from the
commit c63c93237e4decdba5544a96093448605ac549c2,
instead of the following bad fix:
commit d06b0c57ae87ee8743f1bf53f80c1f8e364db619
Author: László Németh <laszlo.nemeth@collabora.com>
Date: Fri Mar 17 15:11:23 2017 +0100
fix Hungarian compound word handling
---
src/hunspell/affixmgr.cxx | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
index 2ed8233..3d65539 100644
--- a/src/hunspell/affixmgr.cxx
+++ b/src/hunspell/affixmgr.cxx
@@ -1816,7 +1816,7 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
// LANG_hu section: spec. Hungarian rule
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(st.substr(i));
+ numsyllable += get_syllable(st.substr(0, i));
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
if (pfx && (get_syllable(pfx->getKey()) > 1))
@@ -1901,7 +1901,7 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
(compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
(((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
((cpdmaxsyllable != 0) &&
- (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->clen)) <=
+ (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
cpdmaxsyllable))) &&
(
// test CHECKCOMPOUNDPATTERN
@@ -2382,7 +2382,7 @@ int AffixMgr::compound_check_morph(const char* word,
// LANG_hu section: spec. Hungarian rule
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(st.substr(i));
+ numsyllable += get_syllable(st.substr(0, i));
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
--
2.7.4
......@@ -21,15 +21,6 @@ $(eval $(call gb_UnpackedTarball_set_patchlevel,hunspell,1))
$(eval $(call gb_UnpackedTarball_add_patches,hunspell, \
external/hunspell/0001-Revert-Remove-autotools-autogenerated-files.patch \
external/hunspell/0001-unroll-this-a-bit.patch \
external/hunspell/0001-cppcheck-redundant-c_str.patch \
external/hunspell/0001-cppcheck-rv-is-reassigned-before-old-value-used.patch \
external/hunspell/0001-loop-via-iterators.patch \
external/hunspell/0002-add-a-get_clen_and_captype-varient-that-takes-a-buff.patch \
external/hunspell/0003-hoist-string-lowering-from-ngram-to-ngsuggest.patch \
external/hunspell/0004-either-clear-will-be-called-anyway-before-use-or-its.patch \
external/hunspell/0002-fix-other-regression-in-compounding.patch \
external/hunspell/0005-fix-syllable-counting-in-compound-word-handling.patch \
))
# vim: set noet sw=4 ts=4:
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment