Discussion:
Collation-based sorting + diacritical marks stripping for search are now in -pu
Gregory Petrosyan
2010-12-24 15:46:01 UTC
Permalink
e.g. searching for Bjork finds Björk, or searching for
Trentemöller finds Trentemøller
Collation-based sorting + diacritical marks stripping for search are now in -pu!

Everybody, please check it out and enjoy — it's an amazing feature indeed!

                Gregory
Johannes Weißl
2011-01-10 17:14:24 UTC
Permalink
e.g. Ǽ -> A instead of Ǽ -> Æ
---
scripts/gen_decomp.py | 11 ++++++-----
unidecomp.h | 12 ++++++------
2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/scripts/gen_decomp.py b/scripts/gen_decomp.py
index b27ec9b..fd944c3 100755
--- a/scripts/gen_decomp.py
+++ b/scripts/gen_decomp.py
@@ -73,14 +73,15 @@ def unidata_expand_decomp(unidata):

def unidata_add_mapping(unidata, mapping):
for k, v in mapping.items():
- if ord(k) not in unidata:
- unidata[ord(k)] = {'decomp': [ord(v)]}
+ unidata[ord(k)]['decomp'] = [ord(v)]

def is_diacritical_mark(c):
return c >= 0x0300 and c <= 0x036F

-def filter_unidata(unidata):
+def filter_unidata(unidata, include):
for k, v in unidata.items():
+ if k in include:
+ continue
if not v['decomp']:
del unidata[k]
continue
@@ -144,9 +145,9 @@ from unicode.org or use `--wget' option.''' % unidata_filename)
unidata = parse_unidata(unidata_file)
unidata_file.close()

- unidata_expand_decomp(unidata)
- filter_unidata(unidata)
unidata_add_mapping(unidata, special_decompositions)
+ unidata_expand_decomp(unidata)
+ filter_unidata(unidata, [ord(x) for x in special_decompositions])

outfile = sys.stdout
if options.output:
diff --git a/unidecomp.h b/unidecomp.h
index c0aef52..b9de9ab 100644
--- a/unidecomp.h
+++ b/unidecomp.h
@@ -203,8 +203,8 @@ static struct {
{ 0x1df, 0x61 }, // ǟ -> a, ̈ (308), ̄ (304)
{ 0x1e0, 0x41 }, // Ǡ -> A, ̇ (307), ̄ (304)
{ 0x1e1, 0x61 }, // ǡ -> a, ̇ (307), ̄ (304)
- { 0x1e2, 0xc6 }, // Ǣ -> Æ, ̄ (304)
- { 0x1e3, 0xe6 }, // ǣ -> æ, ̄ (304)
+ { 0x1e2, 0x41 }, // Ǣ -> A, ̄ (304)
+ { 0x1e3, 0x61 }, // ǣ -> a, ̄ (304)
{ 0x1e6, 0x47 }, // Ǧ -> G, ̌ (30c)
{ 0x1e7, 0x67 }, // ǧ -> g, ̌ (30c)
{ 0x1e8, 0x4b }, // Ǩ -> K, ̌ (30c)
@@ -222,10 +222,10 @@ static struct {
{ 0x1f9, 0x6e }, // ǹ -> n, ̀ (300)
{ 0x1fa, 0x41 }, // Ǻ -> A, ̊ (30a), ́ (301)
{ 0x1fb, 0x61 }, // ǻ -> a, ̊ (30a), ́ (301)
- { 0x1fc, 0xc6 }, // Ǽ -> Æ, ́ (301)
- { 0x1fd, 0xe6 }, // ǽ -> æ, ́ (301)
- { 0x1fe, 0xd8 }, // Ǿ -> Ø, ́ (301)
- { 0x1ff, 0xf8 }, // ǿ -> ø, ́ (301)
+ { 0x1fc, 0x41 }, // Ǽ -> A, ́ (301)
+ { 0x1fd, 0x61 }, // ǽ -> a, ́ (301)
+ { 0x1fe, 0x4f }, // Ǿ -> O, ́ (301)
+ { 0x1ff, 0x6f }, // ǿ -> o, ́ (301)
{ 0x200, 0x41 }, // Ȁ -> A, ̏ (30f)
{ 0x201, 0x61 }, // ȁ -> a, ̏ (30f)
{ 0x202, 0x41 }, // Ȃ -> A, ̑ (311)
--
1.7.2.3
Gregory Petrosyan
2011-01-14 16:58:33 UTC
Permalink
Post by Johannes Weißl
e.g. Ǽ -> A instead of Ǽ -> Æ
Pushed to master, along with the rest of the Unicode stuff from -pu. Thanks!

Gregory

Loading...