Discussion:
[PATCH] Map Unicode punctuation/quotation characters to ASCII ones
Johannes Weißl
2011-06-27 13:54:24 UTC
Permalink
http://wiki.musicbrainz.org/User:Jacobbrett/English_Punctuation_Guide
---
scripts/gen_decomp.py | 15 +++++++++++++++
unidecomp.h | 14 ++++++++++++++
2 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/scripts/gen_decomp.py b/scripts/gen_decomp.py
index 1062ec3..c5f5388 100755
--- a/scripts/gen_decomp.py
+++ b/scripts/gen_decomp.py
@@ -37,6 +37,21 @@ special_decompositions = {
u'ð': u'd',
u'ø': u'o',
u'þ': u'p',
+# Various punctation/quotation characters
+ u'‐': u'-',
+ u'‒': u'-',
+ u'–': u'-',
+ u'−': u'-',
+ u'—': u'-',
+ u'―': u'-',
+ u'‘': u"'",
+ u'’': u"'",
+ u'′': u"'",
+ u'“': u'"',
+ u'”': u'"',
+ u'″': u'"',
+ u'〃': u'"',
+ u'…': u'.',
}

def parse_unidata(f):
diff --git a/unidecomp.h b/unidecomp.h
index b9de9ab..55e4206 100644
--- a/unidecomp.h
+++ b/unidecomp.h
@@ -806,6 +806,18 @@ static struct {
{ 0x1ffa, 0x3a9 }, // Ὼ -> Ω, ̀ (300)
{ 0x1ffb, 0x3a9 }, // Ώ -> Ω, ́ (301)
{ 0x1ffc, 0x3a9 }, // ῼ -> Ω, ͅ (345)
+ { 0x2010, 0x2d }, // ‐ -> -,
+ { 0x2012, 0x2d }, // ‒ -> -,
+ { 0x2013, 0x2d }, // – -> -,
+ { 0x2014, 0x2d }, // — -> -,
+ { 0x2015, 0x2d }, // ― -> -,
+ { 0x2018, 0x27 }, // ‘ -> ',
+ { 0x2019, 0x27 }, // ’ -> ',
+ { 0x201c, 0x22 }, // “ -> ",
+ { 0x201d, 0x22 }, // ” -> ",
+ { 0x2026, 0x2e }, // … -> .,
+ { 0x2032, 0x27 }, // ′ -> ',
+ { 0x2033, 0x22 }, // ″ -> ",
{ 0x212b, 0x41 }, // Å -> A, ̊ (30a)
{ 0x219a, 0x2190 }, // ↚ -> ←, ̸ (338)
{ 0x219b, 0x2192 }, // ↛ -> →, ̸ (338)
@@ -816,6 +828,7 @@ static struct {
{ 0x2204, 0x2203 }, // ∄ -> ∃, ̸ (338)
{ 0x2209, 0x2208 }, // ∉ -> ∈, ̸ (338)
{ 0x220c, 0x220b }, // ∌ -> ∋, ̸ (338)
+ { 0x2212, 0x2d }, // − -> -,
{ 0x2224, 0x2223 }, // ∤ -> ∣, ̸ (338)
{ 0x2226, 0x2225 }, // ∦ -> ∥, ̸ (338)
{ 0x2241, 0x223c }, // ≁ -> ∼, ̸ (338)
@@ -852,4 +865,5 @@ static struct {
{ 0x22ec, 0x22b4 }, // ⋬ -> ⊴, ̸ (338)
{ 0x22ed, 0x22b5 }, // ⋭ -> ⊵, ̸ (338)
{ 0x2adc, 0x2add }, // ⫝̸ -> ⫝, ̸ (338)
+ { 0x3003, 0x22 }, // 〃 -> ",
};
--
1.7.6.rc2
Gregory Petrosyan
2011-06-27 19:18:27 UTC
Permalink
Post by Johannes Weißl
http://wiki.musicbrainz.org/User:Jacobbrett/English_Punctuation_Guide
---
scripts/gen_decomp.py | 15 +++++++++++++++
unidecomp.h | 14 ++++++++++++++
2 files changed, 29 insertions(+), 0 deletions(-)
Thanks, merged to master and maint!

More of the fancy stuff I like -- yay :-)

Gregory
Johannes Weißl
2011-06-27 20:06:31 UTC
Permalink
Post by Gregory Petrosyan
Thanks, merged to master and maint!
More of the fancy stuff I like -- yay :-)
Thanks! Well, I discovered I had a few of these characters in my tags,
but I have now converted them all back to ASCII (which is better for
scrobbling to last.fm). But the changes are still good for people who have
them in their tags!

Johannes

Loading...