Discussion:
[PATCH 3/7] strip unneeded extern keyword
Johannes Weißl
2010-12-23 13:45:49 UTC
Permalink
---
uchar.h | 28 ++++++++++++++--------------
1 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/uchar.h b/uchar.h
index 07870bd..e15f8bd 100644
--- a/uchar.h
+++ b/uchar.h
@@ -61,28 +61,28 @@ static inline int u_char_size(uchar uch)
/*
* Returns width of @uch (normally 1 or 2, 4 for invalid chars (<xx>))
*/
-extern int u_char_width(uchar uch);
+int u_char_width(uchar uch);

/*
* @str any null-terminated string
*
* Returns 1 if @str is valid UTF-8 string, 0 otherwise.
*/
-extern int u_is_valid(const char *str);
+int u_is_valid(const char *str);

/*
* @str null-terminated UTF-8 string
*
* Retuns length of @str in UTF-8 characters.
*/
-extern int u_strlen(const char *str);
+int u_strlen(const char *str);

/*
* @str null-terminated UTF-8 string
*
* Retuns width of @str.
*/
-extern int u_str_width(const char *str);
+int u_str_width(const char *str);

/*
* @str null-terminated UTF-8 string
@@ -90,24 +90,24 @@ extern int u_str_width(const char *str);
*
* Retuns width of the first @len characters in @str.
*/
-extern int u_str_nwidth(const char *str, int len);
+int u_str_nwidth(const char *str, int len);

-extern void u_prev_char_pos(const char *str, int *idx);
+void u_prev_char_pos(const char *str, int *idx);

/*
* @str null-terminated UTF-8 string
* @idx pointer to byte index in @str (not UTF-8 character index!)
* @uch pointer to returned unicode character
*/
-extern void u_get_char(const char *str, int *idx, uchar *uch);
+void u_get_char(const char *str, int *idx, uchar *uch);

/*
* @str destination buffer
* @idx pointer to byte index in @str (not UTF-8 character index!)
* @uch unicode character
*/
-extern void u_set_char_raw(char *str, int *idx, uchar uch);
-extern void u_set_char(char *str, int *idx, uchar uch);
+void u_set_char_raw(char *str, int *idx, uchar uch);
+void u_set_char(char *str, int *idx, uchar uch);

/*
* @dst destination buffer
@@ -120,7 +120,7 @@ extern void u_set_char(char *str, int *idx, uchar uch);
*
* Returns number of _bytes_ copied.
*/
-extern int u_copy_chars(char *dst, const char *src, int *width);
+int u_copy_chars(char *dst, const char *src, int *width);

/*
* @str null-terminated UTF-8 string, must be long enough
@@ -133,11 +133,11 @@ extern int u_copy_chars(char *dst, const char *src, int *width);
*
* Returns number of _bytes_ skipped.
*/
-extern int u_skip_chars(const char *str, int *width);
+int u_skip_chars(const char *str, int *width);

-extern int u_strcasecmp(const char *a, const char *b);
-extern int u_strncasecmp(const char *a, const char *b, int len);
-extern char *u_strcasestr(const char *haystack, const char *needle);
+int u_strcasecmp(const char *a, const char *b);
+int u_strncasecmp(const char *a, const char *b, int len);
+char *u_strcasestr(const char *haystack, const char *needle);

static inline char *u_strcasestr_filename(const char *haystack, const char *needle)
{
--
1.7.2.3
Johannes Weißl
2010-12-23 13:45:48 UTC
Permalink
Reason: need convert() iconv wrapper for collate functions
---
Makefile | 4 +-
convert.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
convert.h | 32 ++++++++++++++++++
id3.c | 2 +-
utf8_encode.c | 69 ---------------------------------------
utf8_encode.h | 25 --------------
6 files changed, 136 insertions(+), 97 deletions(-)
create mode 100644 convert.c
create mode 100644 convert.h
delete mode 100644 utf8_encode.c
delete mode 100644 utf8_encode.h

diff --git a/Makefile b/Makefile
index daff8e2..53a3cc8 100644
--- a/Makefile
+++ b/Makefile
@@ -31,14 +31,14 @@ main.o server.o: CFLAGS += -DDEFAULT_PORT=3000
# programs {{{
cmus-y := \
ape.o browser.o buffer.o cache.o cmdline.o cmus.o command_mode.o comment.o \
- debug.o editable.o expr.o filters.o \
+ convert.lo debug.o editable.o expr.o filters.o \
format_print.o gbuf.o glob.o help.o history.o http.o id3.o input.o job.o \
keys.o keyval.o lib.o load_dir.o locking.o mergesort.o misc.o options.o \
output.o pcm.o pl.o play_queue.o player.o \
rbtree.o read_wrapper.o server.o search.o \
search_mode.o spawn.o tabexp.o tabexp_file.o \
track.o track_info.o tree.o uchar.o ui_curses.o \
- utf8_encode.lo window.o worker.o xstrjoin.o
+ window.o worker.o xstrjoin.o

$(cmus-y): CFLAGS += $(PTHREAD_CFLAGS) $(NCURSES_CFLAGS) $(ICONV_CFLAGS) $(DL_CFLAGS)

diff --git a/convert.c b/convert.c
new file mode 100644
index 0000000..27a266e
--- /dev/null
+++ b/convert.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2004 Timo Hirvonen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include "convert.h"
+#include "xmalloc.h"
+
+#include <iconv.h>
+#include <string.h>
+#include <errno.h>
+
+ssize_t convert(const char *inbuf, ssize_t inbuf_size,
+ char **outbuf, ssize_t outbuf_estimate,
+ const char *tocode, const char *fromcode)
+{
+ const char *in;
+ char *out;
+ size_t outbuf_size, inbytesleft, outbytesleft;
+ iconv_t cd;
+ int rc, finished = 0, err_save;
+
+ cd = iconv_open(tocode, fromcode);
+ if (cd == (iconv_t) -1)
+ return -1;
+
+ if (inbuf_size < 0)
+ inbuf_size = strlen(inbuf);
+ inbytesleft = inbuf_size;
+
+ if (outbuf_estimate < 0)
+ outbuf_size = inbuf_size;
+ else
+ outbuf_size = outbuf_estimate;
+ outbytesleft = outbuf_size;
+
+ in = inbuf;
+ out = *outbuf = xnew(char, outbuf_size + 1);
+
+ while (!finished) {
+ finished = 1;
+ rc = iconv(cd, (char **)&in, &inbytesleft, &out, &outbytesleft);
+ if (rc == (size_t) -1) {
+ if (errno == E2BIG) {
+ size_t used = out - *outbuf;
+ outbytesleft += outbuf_size;
+ outbuf_size *= 2;
+ *outbuf = xrenew(char, *outbuf, outbuf_size + 1);
+ out = *outbuf + used;
+ continue;
+ } else if (errno != EINVAL)
+ goto error;
+ }
+ }
+ /* NUL-terminate for safety reasons */
+ *out = '\0';
+ iconv_close(cd);
+ return outbuf_size - outbytesleft;
+
+error:
+ err_save = errno;
+ free(*outbuf);
+ *outbuf = NULL;
+ iconv_close(cd);
+ errno = err_save;
+ return -1;
+}
+
+int utf8_encode(const char *inbuf, const char *encoding, char **outbuf)
+{
+ size_t inbuf_size, outbuf_size, i;
+ int rc;
+
+ inbuf_size = strlen(inbuf);
+ outbuf_size = inbuf_size;
+ for (i = 0; i < inbuf_size; i++) {
+ unsigned char ch;
+
+ ch = inbuf[i];
+ if (ch > 127)
+ outbuf_size++;
+ }
+
+ rc = convert(inbuf, inbuf_size, outbuf, outbuf_size, "UTF-8", encoding);
+
+ return rc < 0 ? -1 : 0;
+}
diff --git a/convert.h b/convert.h
new file mode 100644
index 0000000..8b48381
--- /dev/null
+++ b/convert.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2004 Timo Hirvonen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#ifndef CONVERT_H
+#define CONVERT_H
+
+#include <sys/types.h> /* ssize_t */
+
+/* Returns length of *outbuf in bytes (without closing '\0'), -1 on error. */
+extern ssize_t convert(const char *inbuf, ssize_t inbuf_size,
+ char **outbuf, ssize_t outbuf_estimate,
+ const char *tocode, const char *fromcode);
+
+extern int utf8_encode(const char *inbuf, const char *encoding, char **outbuf);
+
+#endif
diff --git a/id3.c b/id3.c
index e2f427b..dcb7d9a 100644
--- a/id3.c
+++ b/id3.c
@@ -5,7 +5,7 @@
#include "id3.h"
#include "comment.h"
#include "xmalloc.h"
-#include "utf8_encode.h"
+#include "convert.h"
#include "uchar.h"
#include "options.h"
#include "debug.h"
diff --git a/utf8_encode.c b/utf8_encode.c
deleted file mode 100644
index 46ae6c3..0000000
--- a/utf8_encode.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2004 Timo Hirvonen
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
- */
-
-#include "utf8_encode.h"
-#include "xmalloc.h"
-
-#include <iconv.h>
-#include <string.h>
-#include <errno.h>
-
-int utf8_encode(const char *inbuf, const char *encoding, char **outbuf)
-{
- const char *in;
- char *out;
- size_t inbuf_size, outbuf_size, i;
- iconv_t cd;
- int rc;
-
- cd = iconv_open("UTF-8", encoding);
- if (cd == (iconv_t)-1)
- return -1;
- inbuf_size = strlen(inbuf);
- outbuf_size = inbuf_size;
- for (i = 0; i < inbuf_size; i++) {
- unsigned char ch;
-
- ch = inbuf[i];
- if (ch > 127)
- outbuf_size++;
- }
- *outbuf = xnew(char, outbuf_size + 1);
- in = inbuf;
- out = *outbuf;
- rc = iconv(cd, (char **)&in, &inbuf_size, &out, &outbuf_size);
- *out = 0;
- if (rc == -1) {
- int save = errno;
- iconv_close(cd);
- free(*outbuf);
- *outbuf = NULL;
- errno = save;
- return -1;
- }
- rc = iconv_close(cd);
- if (rc == -1) {
- int save = errno;
- free(*outbuf);
- *outbuf = NULL;
- errno = save;
- return -1;
- }
- return 0;
-}
diff --git a/utf8_encode.h b/utf8_encode.h
deleted file mode 100644
index 0b330a1..0000000
--- a/utf8_encode.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright 2004 Timo Hirvonen
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
- */
-
-#ifndef _UTF8_ENCODE_H
-#define _UTF8_ENCODE_H
-
-extern int utf8_encode(const char *inbuf, const char *encoding, char **outbuf);
-
-#endif
--
1.7.2.3
Gregory Petrosyan
2010-12-23 22:37:32 UTC
Permalink
Post by Johannes Weißl
+int utf8_encode(const char *inbuf, const char *encoding, char **outbuf)
+{
+ size_t inbuf_size, outbuf_size, i;
+ int rc;
+
+ inbuf_size = strlen(inbuf);
+ outbuf_size = inbuf_size;
+ for (i = 0; i < inbuf_size; i++) {
+ unsigned char ch;
+
+ ch = inbuf[i];
+ if (ch > 127)
+ outbuf_size++;
+ }
+
+ rc = convert(inbuf, inbuf_size, outbuf, outbuf_size, "UTF-8", encoding);
+
+ return rc < 0 ? -1 : 0;
+}
Is all this outbuf_size guessing business really worth it? AFAIK e.g. all Russian
characters are 3 bytes in UTF-8, so reallocation will occur despite all the
counting. Even 1 3-byte UTF-8 character will result in a reallocation.

Also, not sure if it is worth it, but it can be made a bit faster without strlen().
Post by Johannes Weißl
diff --git a/convert.h b/convert.h
+
+/* Returns length of *outbuf in bytes (without closing '\0'), -1 on error. */
+extern ssize_t convert(const char *inbuf, ssize_t inbuf_size,
+ char **outbuf, ssize_t outbuf_estimate,
+ const char *tocode, const char *fromcode);
+
+extern int utf8_encode(const char *inbuf, const char *encoding, char **outbuf);
Why leave them extern?

Gregory
Johannes Weißl
2010-12-24 10:26:57 UTC
Permalink
Post by Gregory Petrosyan
Post by Johannes Weißl
+ if (ch > 127)
+ outbuf_size++;
Is all this outbuf_size guessing business really worth it? AFAIK e.g.
all Russian characters are 3 bytes in UTF-8, so reallocation will
occur despite all the counting. Even 1 3-byte UTF-8 character will
result in a reallocation.
Yes, I probably wouldn't have done it myself, but it was already in the
code. I would leave it as it is, since one extra strlen() doesn't
hurt and it avoids reallocating for latin1. Also for Russian it saves
one reallocation step (1 instead of 2).
Post by Gregory Petrosyan
Post by Johannes Weißl
+extern ssize_t convert(const char *inbuf, ssize_t inbuf_size,
+ char **outbuf, ssize_t outbuf_estimate,
+ const char *tocode, const char *fromcode);
+
+extern int utf8_encode(const char *inbuf, const char *encoding, char **outbuf);
Why leave them extern?
Just forgot to strip it :-)


Johannes
Johannes Weißl
2010-12-24 10:44:18 UTC
Permalink
Reason: need convert() iconv wrapper for collate functions
---
Makefile | 4 +-
convert.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
convert.h | 32 ++++++++++++++++++
id3.c | 2 +-
utf8_encode.c | 69 ---------------------------------------
utf8_encode.h | 25 --------------
6 files changed, 136 insertions(+), 97 deletions(-)
create mode 100644 convert.c
create mode 100644 convert.h
delete mode 100644 utf8_encode.c
delete mode 100644 utf8_encode.h

diff --git a/Makefile b/Makefile
index daff8e2..53a3cc8 100644
--- a/Makefile
+++ b/Makefile
@@ -31,14 +31,14 @@ main.o server.o: CFLAGS += -DDEFAULT_PORT=3000
# programs {{{
cmus-y := \
ape.o browser.o buffer.o cache.o cmdline.o cmus.o command_mode.o comment.o \
- debug.o editable.o expr.o filters.o \
+ convert.lo debug.o editable.o expr.o filters.o \
format_print.o gbuf.o glob.o help.o history.o http.o id3.o input.o job.o \
keys.o keyval.o lib.o load_dir.o locking.o mergesort.o misc.o options.o \
output.o pcm.o pl.o play_queue.o player.o \
rbtree.o read_wrapper.o server.o search.o \
search_mode.o spawn.o tabexp.o tabexp_file.o \
track.o track_info.o tree.o uchar.o ui_curses.o \
- utf8_encode.lo window.o worker.o xstrjoin.o
+ window.o worker.o xstrjoin.o

$(cmus-y): CFLAGS += $(PTHREAD_CFLAGS) $(NCURSES_CFLAGS) $(ICONV_CFLAGS) $(DL_CFLAGS)

diff --git a/convert.c b/convert.c
new file mode 100644
index 0000000..27a266e
--- /dev/null
+++ b/convert.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2004 Timo Hirvonen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include "convert.h"
+#include "xmalloc.h"
+
+#include <iconv.h>
+#include <string.h>
+#include <errno.h>
+
+ssize_t convert(const char *inbuf, ssize_t inbuf_size,
+ char **outbuf, ssize_t outbuf_estimate,
+ const char *tocode, const char *fromcode)
+{
+ const char *in;
+ char *out;
+ size_t outbuf_size, inbytesleft, outbytesleft;
+ iconv_t cd;
+ int rc, finished = 0, err_save;
+
+ cd = iconv_open(tocode, fromcode);
+ if (cd == (iconv_t) -1)
+ return -1;
+
+ if (inbuf_size < 0)
+ inbuf_size = strlen(inbuf);
+ inbytesleft = inbuf_size;
+
+ if (outbuf_estimate < 0)
+ outbuf_size = inbuf_size;
+ else
+ outbuf_size = outbuf_estimate;
+ outbytesleft = outbuf_size;
+
+ in = inbuf;
+ out = *outbuf = xnew(char, outbuf_size + 1);
+
+ while (!finished) {
+ finished = 1;
+ rc = iconv(cd, (char **)&in, &inbytesleft, &out, &outbytesleft);
+ if (rc == (size_t) -1) {
+ if (errno == E2BIG) {
+ size_t used = out - *outbuf;
+ outbytesleft += outbuf_size;
+ outbuf_size *= 2;
+ *outbuf = xrenew(char, *outbuf, outbuf_size + 1);
+ out = *outbuf + used;
+ continue;
+ } else if (errno != EINVAL)
+ goto error;
+ }
+ }
+ /* NUL-terminate for safety reasons */
+ *out = '\0';
+ iconv_close(cd);
+ return outbuf_size - outbytesleft;
+
+error:
+ err_save = errno;
+ free(*outbuf);
+ *outbuf = NULL;
+ iconv_close(cd);
+ errno = err_save;
+ return -1;
+}
+
+int utf8_encode(const char *inbuf, const char *encoding, char **outbuf)
+{
+ size_t inbuf_size, outbuf_size, i;
+ int rc;
+
+ inbuf_size = strlen(inbuf);
+ outbuf_size = inbuf_size;
+ for (i = 0; i < inbuf_size; i++) {
+ unsigned char ch;
+
+ ch = inbuf[i];
+ if (ch > 127)
+ outbuf_size++;
+ }
+
+ rc = convert(inbuf, inbuf_size, outbuf, outbuf_size, "UTF-8", encoding);
+
+ return rc < 0 ? -1 : 0;
+}
diff --git a/convert.h b/convert.h
new file mode 100644
index 0000000..45ef667
--- /dev/null
+++ b/convert.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2004 Timo Hirvonen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#ifndef CONVERT_H
+#define CONVERT_H
+
+#include <sys/types.h> /* ssize_t */
+
+/* Returns length of *outbuf in bytes (without closing '\0'), -1 on error. */
+ssize_t convert(const char *inbuf, ssize_t inbuf_size,
+ char **outbuf, ssize_t outbuf_estimate,
+ const char *tocode, const char *fromcode);
+
+int utf8_encode(const char *inbuf, const char *encoding, char **outbuf);
+
+#endif
diff --git a/id3.c b/id3.c
index e2f427b..dcb7d9a 100644
--- a/id3.c
+++ b/id3.c
@@ -5,7 +5,7 @@
#include "id3.h"
#include "comment.h"
#include "xmalloc.h"
-#include "utf8_encode.h"
+#include "convert.h"
#include "uchar.h"
#include "options.h"
#include "debug.h"
diff --git a/utf8_encode.c b/utf8_encode.c
deleted file mode 100644
index 46ae6c3..0000000
--- a/utf8_encode.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2004 Timo Hirvonen
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
- */
-
-#include "utf8_encode.h"
-#include "xmalloc.h"
-
-#include <iconv.h>
-#include <string.h>
-#include <errno.h>
-
-int utf8_encode(const char *inbuf, const char *encoding, char **outbuf)
-{
- const char *in;
- char *out;
- size_t inbuf_size, outbuf_size, i;
- iconv_t cd;
- int rc;
-
- cd = iconv_open("UTF-8", encoding);
- if (cd == (iconv_t)-1)
- return -1;
- inbuf_size = strlen(inbuf);
- outbuf_size = inbuf_size;
- for (i = 0; i < inbuf_size; i++) {
- unsigned char ch;
-
- ch = inbuf[i];
- if (ch > 127)
- outbuf_size++;
- }
- *outbuf = xnew(char, outbuf_size + 1);
- in = inbuf;
- out = *outbuf;
- rc = iconv(cd, (char **)&in, &inbuf_size, &out, &outbuf_size);
- *out = 0;
- if (rc == -1) {
- int save = errno;
- iconv_close(cd);
- free(*outbuf);
- *outbuf = NULL;
- errno = save;
- return -1;
- }
- rc = iconv_close(cd);
- if (rc == -1) {
- int save = errno;
- free(*outbuf);
- *outbuf = NULL;
- errno = save;
- return -1;
- }
- return 0;
-}
diff --git a/utf8_encode.h b/utf8_encode.h
deleted file mode 100644
index 0b330a1..0000000
--- a/utf8_encode.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright 2004 Timo Hirvonen
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
- */
-
-#ifndef _UTF8_ENCODE_H
-#define _UTF8_ENCODE_H
-
-extern int utf8_encode(const char *inbuf, const char *encoding, char **outbuf);
-
-#endif
--
1.7.2.3
Johannes Weißl
2010-12-23 13:45:50 UTC
Permalink
---
uchar.c | 46 +++++++++++++++++-----------------------------
uchar.h | 16 +++++++++++++++-
2 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/uchar.c b/uchar.c
index 6f342b0..ca79d02 100644
--- a/uchar.c
+++ b/uchar.c
@@ -76,6 +76,19 @@ static const signed char len_tab[256] = {
-1, -1, -1, -1, -1, -1, -1, -1
};

+/* fault-tolerant equivalent to len_tab, from glib */
+static const char utf8_skip_data[256] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
+const char * const utf8_skip = utf8_skip_data;
+
/* index is length of the UTF-8 sequence - 1 */
static int min_val[4] = { 0x000000, 0x000080, 0x000800, 0x010000 };
static int max_val[4] = { 0x00007f, 0x0007ff, 0x00ffff, 0x10ffff };
@@ -117,36 +130,11 @@ int u_is_valid(const char *str)
return 1;
}

-int u_strlen(const char *str)
+size_t u_strlen(const char *str)
{
- const unsigned char *s = (const unsigned char *)str;
- int len = 0;
-
- while (*s) {
- int l = len_tab[*s];
-
- if (unlikely(l > 1)) {
- /* next l - 1 bytes must be 0x10xxxxxx */
- int c = 1;
- do {
- if (len_tab[s[c]] != 0) {
- /* invalid sequence */
- goto single_char;
- }
- c++;
- } while (c < l);
-
- /* valid sequence */
- s += l;
- len++;
- continue;
- }
-single_char:
- /* l is -1, 0 or 1
- * invalid chars counted as single characters */
- s++;
- len++;
- }
+ size_t len;
+ for (len = 0; *str; len++)
+ str = u_next_char(str);
return len;
}

diff --git a/uchar.h b/uchar.h
index e15f8bd..f6fb56d 100644
--- a/uchar.h
+++ b/uchar.h
@@ -20,6 +20,8 @@
#ifndef _UCHAR_H
#define _UCHAR_H

+#include <stddef.h> /* size_t */
+
typedef unsigned int uchar;

extern const char hex_tab[16];
@@ -73,9 +75,21 @@ int u_is_valid(const char *str);
/*
* @str null-terminated UTF-8 string
*
+ * Returns position of next unicode character in @str.
+ * (fast and fault-tolerant)
+ */
+extern const char * const utf8_skip;
+static inline char *u_next_char(const char *str)
+{
+ return (char *) (str + utf8_skip[*((const unsigned char *) str)]);
+}
+
+/*
+ * @str null-terminated UTF-8 string
+ *
* Retuns length of @str in UTF-8 characters.
*/
-int u_strlen(const char *str);
+size_t u_strlen(const char *str);

/*
* @str null-terminated UTF-8 string
--
1.7.2.3
Gregory Petrosyan
2010-12-23 22:48:33 UTC
Permalink
Post by Johannes Weißl
uchar.c | 46 +++++++++++++++++-----------------------------
uchar.h | 16 +++++++++++++++-
This version is definitely more concise. Just curious: is it actually faster,
and by what margin?

Gregory
Johannes Weißl
2010-12-24 10:27:21 UTC
Permalink
Post by Gregory Petrosyan
This version is definitely more concise. Just curious: is it actually faster,
and by what margin?
It can be twice as fast, and is never slower. And since it's also more
concise, there was not much to think about :-).

here are my results:
* on 104 MiB ASCII
old strlen: 0.796943s
new strlen: 0.665816s
* on 126 MiB latin1 UTF-8
new strlen: 0.414058s
old strlen: 0.975103s
* on 128 MiB complicated UTF-8
new strlen: 0.448511s
old strlen: 1.093729s
Johannes Weißl
2010-12-23 13:45:51 UTC
Permalink
---
ui_curses.c | 12 +++++-------
ui_curses.h | 3 +++
2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/ui_curses.c b/ui_curses.c
index 568cd3e..0472aaa 100644
--- a/ui_curses.c
+++ b/ui_curses.c
@@ -86,6 +86,9 @@ char *pl_filename = NULL;
char *pl_ext_filename = NULL;
char *play_queue_filename = NULL;
char *play_queue_ext_filename = NULL;
+char *charset = NULL;
+int using_utf8 = 0;
+

/* ------------------------------------------------------------------------- */

@@ -103,7 +106,6 @@ static int error_count = 0;

static char *server_address = NULL;

-static char *charset = NULL;
static char print_buffer[512];

/* destination buffer for utf8_encode and utf8_decode */
@@ -112,8 +114,6 @@ static char conv_buffer[512];
/* one character can take up to 4 bytes in UTF-8 */
#define print_buffer_max_width (sizeof(print_buffer) / 4 - 1)

-static int using_utf8;
-
/* used for messages to the client */
static int client_fd = -1;

@@ -2250,11 +2250,9 @@ int main(int argc, char *argv[])
#else
charset = "ISO-8859-1";
#endif
- if (strcmp(charset, "UTF-8") == 0) {
+ if (strcmp(charset, "UTF-8") == 0)
using_utf8 = 1;
- } else {
- using_utf8 = 0;
- }
+
misc_init();
if (server_address == NULL)
server_address = xstrjoin(cmus_config_dir, "/socket");
diff --git a/ui_curses.h b/ui_curses.h
index 7234bc2..9bd32eb 100644
--- a/ui_curses.h
+++ b/ui_curses.h
@@ -42,6 +42,9 @@ extern char *pl_ext_filename;
extern char *play_queue_filename;
extern char *play_queue_ext_filename;

+extern char *charset;
+extern int using_utf8;
+
void update_titleline(void);
void update_statusline(void);
void update_colors(void);
--
1.7.2.3
Johannes Weißl
2010-12-23 13:45:52 UTC
Permalink
* Replace u_strcasecmp with u_strcasecoll and u_strcase_equal, since
Unicode strings can't be sorted locale-independently.
* Only use towlower() on systems where wchar_t is UCS-4.
---
Makefile | 2 +-
comment.c | 2 +-
glob.c | 2 +-
track_info.c | 17 ++---------
tree.c | 13 +++-----
u_collate.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
u_collate.h | 51 ++++++++++++++++++++++++++++++++++
uchar.c | 85 +++++++++++++++++++++++++++++++++++++--------------------
uchar.h | 35 ++++++++++++++++++++++-
ui_curses.c | 1 +
10 files changed, 230 insertions(+), 57 deletions(-)
create mode 100644 u_collate.c
create mode 100644 u_collate.h

diff --git a/Makefile b/Makefile
index 53a3cc8..aa3f032 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,7 @@ cmus-y := \
output.o pcm.o pl.o play_queue.o player.o \
rbtree.o read_wrapper.o server.o search.o \
search_mode.o spawn.o tabexp.o tabexp_file.o \
- track.o track_info.o tree.o uchar.o ui_curses.o \
+ track.o track_info.o tree.o u_collate.o uchar.o ui_curses.o \
window.o worker.o xstrjoin.o

$(cmus-y): CFLAGS += $(PTHREAD_CFLAGS) $(NCURSES_CFLAGS) $(ICONV_CFLAGS) $(DL_CFLAGS)
diff --git a/comment.c b/comment.c
index 042e930..a0e1674 100644
--- a/comment.c
+++ b/comment.c
@@ -29,7 +29,7 @@ int track_is_compilation(const struct keyval *comments)
return 1;

a = keyvals_get_val(comments, "artist");
- if (aa && a && u_strcasecmp(aa, a) != 0)
+ if (aa && a && !u_strcase_equal(aa, a))
return 1;

return 0;
diff --git a/glob.c b/glob.c
index ebca873..b4b0627 100644
--- a/glob.c
+++ b/glob.c
@@ -179,7 +179,7 @@ static int do_glob_match(struct list_head *head, struct list_head *first, const
if (gitem->type == GLOB_TEXT) {
int len = u_strlen(gitem->text);

- if (u_strncasecmp(gitem->text, text, len))
+ if (!u_strncase_equal(gitem->text, text, len))
return 0;
text += strlen(gitem->text);
} else if (gitem->type == GLOB_QMARK) {
diff --git a/track_info.c b/track_info.c
index 9190106..6a519c2 100644
--- a/track_info.c
+++ b/track_info.c
@@ -20,6 +20,7 @@
#include "track_info.h"
#include "comment.h"
#include "uchar.h"
+#include "u_collate.h"
#include "misc.h"
#include "xmalloc.h"
#include "utils.h"
@@ -113,18 +114,6 @@ int track_info_matches(struct track_info *ti, const char *text, unsigned int fla
return matched;
}

-static int xstrcasecmp(const char *a, const char *b)
-{
- if (a == NULL) {
- if (b == NULL)
- return 0;
- return -1;
- } else if (b == NULL) {
- return 1;
- }
- return u_strcasecmp(a, b);
-}
-
int track_info_cmp(const struct track_info *a, const struct track_info *b, const char * const *keys)
{
int i, res = 0;
@@ -158,7 +147,7 @@ int track_info_cmp(const struct track_info *a, const struct track_info *b, const
if (strcmp(key, "albumartist") == 0) {
av = comments_get_albumartist(a->comments);
bv = comments_get_albumartist(b->comments);
- res = xstrcasecmp(av, bv);
+ res = u_strcasecoll0(av, bv);
if (res)
break;
continue;
@@ -172,7 +161,7 @@ int track_info_cmp(const struct track_info *a, const struct track_info *b, const

av = keyvals_get_val(a->comments, key);
bv = keyvals_get_val(b->comments, key);
- res = xstrcasecmp(av, bv);
+ res = u_strcasecoll0(av, bv);
if (res)
break;
}
diff --git a/tree.c b/tree.c
index 3cf5ac6..aa53784 100644
--- a/tree.c
+++ b/tree.c
@@ -10,6 +10,7 @@
#include "debug.h"
#include "mergesort.h"
#include "options.h"
+#include "u_collate.h"

#include <ctype.h>
#include <stdio.h>
@@ -400,14 +401,10 @@ static void find_artist_and_album(const char *artist_name,
struct album *album;

list_for_each_entry(artist, &lib_artist_head, node) {
- int res;
-
- res = u_strcasecmp(artist->name, artist_name);
- if (res == 0) {
+ if (u_strcase_equal(artist->name, artist_name)) {
*_artist = artist;
list_for_each_entry(album, &artist->album_head, node) {
- res = u_strcasecmp(album->name, album_name);
- if (res == 0) {
+ if (u_strcase_equal(album->name, album_name)) {
*_album = album;
return;
}
@@ -428,7 +425,7 @@ static int special_name_cmp(const char *a, const char *b)

if (cmp)
return cmp;
- return u_strcasecmp(a, b);
+ return u_strcasecoll(a, b);
}

static int special_album_cmp(const struct album *a, const struct album *b)
@@ -442,7 +439,7 @@ static int special_album_cmp(const struct album *a, const struct album *b)
if (a->date != b->date)
return a->date - b->date;

- return u_strcasecmp(a->name, b->name);
+ return u_strcasecoll(a->name, b->name);
}

static void insert_artist(struct artist *artist)
diff --git a/u_collate.c b/u_collate.c
new file mode 100644
index 0000000..8dba7af
--- /dev/null
+++ b/u_collate.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2010 Johannes Weißl (based on gunicollate.c from glib)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include "u_collate.h"
+#include "uchar.h"
+#include "xmalloc.h"
+#include "ui_curses.h" /* using_utf8, charset */
+#include "convert.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+int u_strcoll(const char *str1, const char *str2)
+{
+ int result;
+
+ if (using_utf8) {
+ result = strcoll(str1, str2);
+ } else {
+ char *str1_locale, *str2_locale;
+
+ convert(str1, -1, &str1_locale, -1, charset, "UTF-8");
+ convert(str2, -1, &str2_locale, -1, charset, "UTF-8");
+
+ if (str1_locale && str2_locale)
+ result = strcoll(str1_locale, str2_locale);
+ else
+ result = strcmp(str1, str2);
+
+ if (str2_locale)
+ free(str2_locale);
+ if (str1_locale)
+ free(str1_locale);
+ }
+
+ return result;
+}
+
+int u_strcasecoll(const char *str1, const char *str2)
+{
+ char *cf_a, *cf_b;
+ int res;
+
+ cf_a = u_casefold(str1);
+ cf_b = u_casefold(str2);
+
+ res = u_strcoll(cf_a, cf_b);
+
+ free(cf_b);
+ free(cf_a);
+
+ return res;
+}
+
+int u_strcasecoll0(const char *str1, const char *str2)
+{
+ if (!str1)
+ return str2 ? -1 : 0;
+ if (!str2)
+ return 1;
+
+ return u_strcasecoll(str1, str2);
+}
diff --git a/u_collate.h b/u_collate.h
new file mode 100644
index 0000000..576df57
--- /dev/null
+++ b/u_collate.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2010 Johannes Weißl
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#ifndef U_COLLATE_H
+#define U_COLLATE_H
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string
+ * @str2 valid, normalized, null-terminated UTF-8 string
+ *
+ * Compares two strings for ordering using the linguistically
+ * correct rules for the current locale.
+ *
+ * Returns a value < 0 if @str1 compares before @str2, 0 if they compare
+ * equal, and a value > 0 if @str1 compares after @str2.
+ */
+int u_strcoll(const char *str1, const char *str2);
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string
+ * @str2 valid, normalized, null-terminated UTF-8 string
+ *
+ * Like u_strcoll(), but case-folds both strings before comparing.
+ */
+int u_strcasecoll(const char *str1, const char *str2);
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string or NULL
+ * @str2 valid, normalized, null-terminated UTF-8 string or NULL
+ *
+ * Like u_strcasecoll(), but handle NULL pointers gracefully.
+ */
+int u_strcasecoll0(const char *str1, const char *str2);
+
+#endif
diff --git a/uchar.c b/uchar.c
index ca79d02..5df3079 100644
--- a/uchar.c
+++ b/uchar.c
@@ -19,6 +19,7 @@

#include "uchar.h"
#include "compiler.h"
+#include "gbuf.h"

#include <stdlib.h>
#include <string.h>
@@ -455,57 +456,81 @@ int u_skip_chars(const char *str, int *width)
}

/*
- * Comparison functions
+ * Case-folding functions
*/

-static inline int chcasecmp(int a, int b)
+static inline uchar u_casefold_char(uchar ch)
{
- return towupper(a) - towupper(b);
+ /* faster lookup for A-Z, rest of ASCII unaffected */
+ if (ch < 0x0041)
+ return ch;
+ if (ch <= 0x005A)
+ return ch + 0x20;
+#ifdef __STDC_ISO_10646__
+ if (ch < 128)
+ return ch;
+ ch = towlower(ch);
+#endif
+ return ch;
}

-int u_strcasecmp(const char *a, const char *b)
+char *u_casefold(const char *str)
{
- int ai = 0;
- int bi = 0;
- int res;
+ GBUF(out);
+ int i = 0;

- do {
+ while (str[i]) {
+ char buf[4];
+ int buflen = 0;
+ uchar ch;
+
+ u_get_char(str, &i, &ch);
+ ch = u_casefold_char(ch);
+ u_set_char_raw(buf, &buflen, ch);
+ gbuf_add_bytes(&out, buf, buflen);
+ }
+
+ return out.buffer;
+}
+
+/*
+ * Comparison functions
+ */
+
+int u_strcase_equal(const char *a, const char *b)
+{
+ int ai = 0, bi = 0;
+
+ while (a[ai]) {
uchar au, bu;

u_get_char(a, &ai, &au);
u_get_char(b, &bi, &bu);
- res = chcasecmp(au, bu);
- if (res)
- break;
- if (au == 0) {
- /* bu is 0 too */
- break;
- }
- } while (1);
- return res;
+
+ if (u_casefold_char(au) != u_casefold_char(bu))
+ return 0;
+ }
+
+ return b[bi] ? 0 : 1;
}

-int u_strncasecmp(const char *a, const char *b, int len)
+int u_strncase_equal(const char *a, const char *b, size_t len)
{
- int ai = 0;
- int bi = 0;
+ int ai = 0, bi = 0;

- while (len > 0) {
+ while (b[bi] && len > 0) {
uchar au, bu;
- int res;

u_get_char(a, &ai, &au);
u_get_char(b, &bi, &bu);
- res = chcasecmp(au, bu);
- if (res)
- return res;
- if (au == 0) {
- /* bu is 0 too */
+
+ if (u_casefold_char(au) != u_casefold_char(bu))
return 0;
- }
+
len--;
}
- return 0;
+
+ return 1;
}

char *u_strcasestr(const char *haystack, const char *needle)
@@ -520,7 +545,7 @@ char *u_strcasestr(const char *haystack, const char *needle)

if (haystack_len < needle_len)
return NULL;
- if (u_strncasecmp(needle, haystack, needle_len) == 0)
+ if (u_strncase_equal(needle, haystack, needle_len))
return (char *)haystack;

/* skip one char */
diff --git a/uchar.h b/uchar.h
index f6fb56d..21cd317 100644
--- a/uchar.h
+++ b/uchar.h
@@ -149,8 +149,39 @@ int u_copy_chars(char *dst, const char *src, int *width);
*/
int u_skip_chars(const char *str, int *width);

-int u_strcasecmp(const char *a, const char *b);
-int u_strncasecmp(const char *a, const char *b, int len);
+/*
+ * @str valid null-terminated UTF-8 string
+ *
+ * Converts a string into a form that is independent of case.
+ *
+ * Returns a newly allocated string which the caller must free().
+ */
+char *u_casefold(const char *str);
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string
+ * @str2 valid, normalized, null-terminated UTF-8 string
+ *
+ * Returns 1 if @str1 equals @str2 ignoring the case of the characters, 0 otherwise.
+ */
+int u_strcase_equal(const char *str1, const char *str2);
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string
+ * @str2 valid, normalized, null-terminated UTF-8 string
+ * @len number of characters to consider for comparison
+ *
+ * Returns 1 if the first @len characters of @str1 and @str2 are equal,
+ * ignoring the case of the characters (0 otherwise).
+ */
+int u_strncase_equal(const char *str1, const char *str2, size_t len);
+
+/*
+ * @haystack valid, normalized, null-terminated UTF-8 string
+ * @needle valid, normalized, null-terminated UTF-8 string
+ *
+ * Returns a pointer to the match of @needle in @haystack (case insensitive comparison), or NULL if not found.
+ */
char *u_strcasestr(const char *haystack, const char *needle);

static inline char *u_strcasestr_filename(const char *haystack, const char *needle)
diff --git a/ui_curses.c b/ui_curses.c
index 0472aaa..fc36766 100644
--- a/ui_curses.c
+++ b/ui_curses.c
@@ -2245,6 +2245,7 @@ int main(int argc, char *argv[])
}

setlocale(LC_CTYPE, "");
+ setlocale(LC_COLLATE, "");
#ifdef CODESET
charset = nl_langinfo(CODESET);
#else
--
1.7.2.3
Gregory Petrosyan
2010-12-23 23:11:57 UTC
Permalink
Post by Johannes Weißl
+ if (ch < 0x0041)
+ return ch;
+ if (ch <= 0x005A)
+ return ch + 0x20;
Maybe use 'A' and 'Z' here? Just a stylistic choice, of course.

That's all the critic I have for this patch left, sorry :-)

Gregory
Johannes Weißl
2010-12-24 10:27:32 UTC
Permalink
Post by Gregory Petrosyan
Post by Johannes Weißl
+ if (ch < 0x0041)
+ return ch;
+ if (ch <= 0x005A)
+ return ch + 0x20;
Maybe use 'A' and 'Z' here? Just a stylistic choice, of course.
That's all the critic I have for this patch left, sorry :-)
If I recall correctly, the C standard is not tied to ASCII, so this
wouldn't be 100% pedantically correct :-).


Johannes
Johannes Weißl
2010-12-23 13:45:53 UTC
Permalink
e.g. searching for Bjork finds Björk, or searching for
Trentemöller finds Trentemøller
---
glob.c | 4 +-
scripts/gen_decomp.py | 159 +++++++++
track_info.c | 8 +-
uchar.c | 57 +++-
uchar.h | 19 ++
unidecomp.h | 855 +++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 1093 insertions(+), 9 deletions(-)
create mode 100755 scripts/gen_decomp.py
create mode 100644 unidecomp.h

diff --git a/glob.c b/glob.c
index b4b0627..f75a8a4 100644
--- a/glob.c
+++ b/glob.c
@@ -179,7 +179,7 @@ static int do_glob_match(struct list_head *head, struct list_head *first, const
if (gitem->type == GLOB_TEXT) {
int len = u_strlen(gitem->text);

- if (!u_strncase_equal(gitem->text, text, len))
+ if (!u_strncase_equal_base(gitem->text, text, len))
return 0;
text += strlen(gitem->text);
} else if (gitem->type == GLOB_QMARK) {
@@ -212,7 +212,7 @@ static int do_glob_match(struct list_head *head, struct list_head *first, const
while (1) {
const char *pos;

- pos = u_strcasestr(text, t);
+ pos = u_strcasestr_base(text, t);
if (pos == NULL)
return 0;
if (do_glob_match(head, next->next, pos + tlen))
diff --git a/scripts/gen_decomp.py b/scripts/gen_decomp.py
new file mode 100755
index 0000000..b27ec9b
--- /dev/null
+++ b/scripts/gen_decomp.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2010 Johannes Weißl
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+
+import sys
+import re
+import os.path
+import urllib2
+from optparse import OptionParser
+
+# Some letters don't have a Unicode decomposition, but can't be typed on
+# all keyboards. This dictionary maps them to an ASCII character which
+# *looks* similar.
+special_decompositions = {
+ u'Æ': u'A',
+ u'Ð': u'D',
+ u'×': u'x',
+ u'Ø': u'O',
+ u'Þ': u'P',
+ u'ß': u'B',
+ u'æ': u'a',
+ u'ð': u'd',
+ u'ø': u'o',
+ u'þ': u'p',
+}
+
+def parse_unidata(f):
+ u = {}
+ for line in f:
+ d = line.rstrip('\n').split(';')
+ cp = int(d[0], 16)
+ u[cp] = {}
+ u[cp]['name'] = d[1]
+ decomp = d[5]
+ if decomp:
+ m = re.match(r'<.*> (.*)', decomp)
+ u[cp]['compat'] = bool(m)
+ if m:
+ decomp = m.group(1)
+ u[cp]['decomp'] = [int(x, 16) for x in decomp.split(' ')]
+ else:
+ u[cp]['decomp'] = []
+ return u
+
+def unidata_expand_decomp(unidata):
+ def recurse(k):
+ if k not in unidata or not unidata[k]['decomp']:
+ return [k]
+ exp = []
+ for d in unidata[k]['decomp']:
+ exp += recurse(d)
+ return exp
+ for k in unidata.keys():
+ exp = recurse(k)
+ if exp != [k]:
+ unidata[k]['decomp'] = exp
+
+def unidata_add_mapping(unidata, mapping):
+ for k, v in mapping.items():
+ if ord(k) not in unidata:
+ unidata[ord(k)] = {'decomp': [ord(v)]}
+
+def is_diacritical_mark(c):
+ return c >= 0x0300 and c <= 0x036F
+
+def filter_unidata(unidata):
+ for k, v in unidata.items():
+ if not v['decomp']:
+ del unidata[k]
+ continue
+ b = v['decomp'][0]
+ if unichr(b) == u' ' or is_diacritical_mark(b):
+ del unidata[k]
+ continue
+ has_accents = False
+ for d in v['decomp'][1:]:
+ if is_diacritical_mark(d):
+ has_accents = True
+ break
+ if not has_accents:
+ del unidata[k]
+
+def output(unidata, f):
+ buf = '''/* This file is automatically generated. DO NOT EDIT!
+Instead, edit %s and re-run. */
+
+static struct {
+ uchar composed;
+ uchar base;
+} unidecomp_map[] = {
+''' % os.path.basename(sys.argv[0])
+ for k in sorted(unidata.keys()):
+ b = unidata[k]['decomp'][0]
+ buf += '\t{ %#6x, %#6x },\t// %s -> %s,\t%s\n' % \
+ (k, b,
+ unichr(k).encode('utf-8'),
+ unichr(b).encode('utf-8'),
+ ', '.join([' %s (%x)' %
+ (unichr(d).encode('utf-8'), d)
+ for d in unidata[k]['decomp'][1:]]))
+ buf += '};'
+ f.write(buf+'\n')
+
+def main(argv=None):
+
+ if not argv:
+ argv = sys.argv
+
+ parser = OptionParser(usage='usage: %prog [-w] [-o unidecomp.h]')
+ parser.add_option('-w', '--wget', action='store_true',
+ help='get unicode data from unicode.org')
+ parser.add_option('-o', '--output',
+ help='output file, default stdout')
+ (options, args) = parser.parse_args(argv[1:])
+
+ urlbase = 'http://unicode.org/Public/UNIDATA/'
+ unidata_filename = 'UnicodeData.txt'
+
+ if not os.path.exists(unidata_filename) and not options.wget:
+ parser.error('''need %s in the current directory, download
+from unicode.org or use `--wget' option.''' % unidata_filename)
+
+ if options.wget:
+ unidata_file = urllib2.urlopen(urlbase+unidata_filename)
+ else:
+ unidata_file = open(unidata_filename, 'rb')
+
+ unidata = parse_unidata(unidata_file)
+ unidata_file.close()
+
+ unidata_expand_decomp(unidata)
+ filter_unidata(unidata)
+ unidata_add_mapping(unidata, special_decompositions)
+
+ outfile = sys.stdout
+ if options.output:
+ outfile = open(options.output, 'wb')
+ output(unidata, outfile)
+ if options.output:
+ outfile.close()
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/track_info.c b/track_info.c
index 6a519c2..38763f0 100644
--- a/track_info.c
+++ b/track_info.c
@@ -87,13 +87,13 @@ int track_info_matches(struct track_info *ti, const char *text, unsigned int fla
(flags & TI_MATCH_ALBUM && album) ||
(flags & TI_MATCH_TITLE && title) ||
(flags & TI_MATCH_ALBUMARTIST && albumartist)) {
- if (flags & TI_MATCH_ARTIST && artist && u_strcasestr(artist, word))
+ if (flags & TI_MATCH_ARTIST && artist && u_strcasestr_base(artist, word))
continue;
- if (flags & TI_MATCH_ALBUM && album && u_strcasestr(album, word))
+ if (flags & TI_MATCH_ALBUM && album && u_strcasestr_base(album, word))
continue;
- if (flags & TI_MATCH_TITLE && title && u_strcasestr(title, word))
+ if (flags & TI_MATCH_TITLE && title && u_strcasestr_base(title, word))
continue;
- if (flags & TI_MATCH_ALBUMARTIST && albumartist && u_strcasestr(albumartist, word))
+ if (flags & TI_MATCH_ALBUMARTIST && albumartist && u_strcasestr_base(albumartist, word))
continue;
} else {
/* compare with url or filename without path */
diff --git a/uchar.c b/uchar.c
index 5df3079..8489f52 100644
--- a/uchar.c
+++ b/uchar.c
@@ -20,6 +20,7 @@
#include "uchar.h"
#include "compiler.h"
#include "gbuf.h"
+#include "utils.h" /* N_ELEMENTS */

#include <stdlib.h>
#include <string.h>
@@ -27,6 +28,8 @@
#include <wctype.h>
#include <ctype.h>

+#include "unidecomp.h"
+
const char hex_tab[16] = "0123456789abcdef";

/*
@@ -514,7 +517,30 @@ int u_strcase_equal(const char *a, const char *b)
return b[bi] ? 0 : 1;
}

-int u_strncase_equal(const char *a, const char *b, size_t len)
+static uchar get_base_from_composed(uchar ch)
+{
+ int begin = 0;
+ int end = N_ELEMENTS(unidecomp_map);
+
+ if (ch < unidecomp_map[begin].composed || ch > unidecomp_map[end - 1].composed)
+ return ch;
+
+ /* binary search */
+ while (1) {
+ int half = (begin + end) / 2;
+ if (ch == unidecomp_map[half].composed)
+ return unidecomp_map[half].base;
+ else if (half == begin)
+ break;
+ else if (ch > unidecomp_map[half].composed)
+ begin = half;
+ else
+ end = half;
+ }
+ return ch;
+}
+
+static inline int do_u_strncase_equal(const char *a, const char *b, size_t len, int only_base_chars)
{
int ai = 0, bi = 0;

@@ -524,6 +550,11 @@ int u_strncase_equal(const char *a, const char *b, size_t len)
u_get_char(a, &ai, &au);
u_get_char(b, &bi, &bu);

+ if (only_base_chars) {
+ au = get_base_from_composed(au);
+ bu = get_base_from_composed(bu);
+ }
+
if (u_casefold_char(au) != u_casefold_char(bu))
return 0;

@@ -533,7 +564,17 @@ int u_strncase_equal(const char *a, const char *b, size_t len)
return 1;
}

-char *u_strcasestr(const char *haystack, const char *needle)
+int u_strncase_equal(const char *a, const char *b, size_t len)
+{
+ return do_u_strncase_equal(a, b, len, 0);
+}
+
+int u_strncase_equal_base(const char *a, const char *b, size_t len)
+{
+ return do_u_strncase_equal(a, b, len, 1);
+}
+
+static inline char *do_u_strcasestr(const char *haystack, const char *needle, int only_base_chars)
{
/* strlen is faster and works here */
int haystack_len = strlen(haystack);
@@ -545,7 +586,7 @@ char *u_strcasestr(const char *haystack, const char *needle)

if (haystack_len < needle_len)
return NULL;
- if (u_strncase_equal(needle, haystack, needle_len))
+ if (do_u_strncase_equal(needle, haystack, needle_len, only_base_chars))
return (char *)haystack;

/* skip one char */
@@ -555,3 +596,13 @@ char *u_strcasestr(const char *haystack, const char *needle)
haystack_len -= idx;
} while (1);
}
+
+char *u_strcasestr(const char *haystack, const char *needle)
+{
+ return do_u_strcasestr(haystack, needle, 0);
+}
+
+char *u_strcasestr_base(const char *haystack, const char *needle)
+{
+ return do_u_strcasestr(haystack, needle, 1);
+}
diff --git a/uchar.h b/uchar.h
index 21cd317..fed36a5 100644
--- a/uchar.h
+++ b/uchar.h
@@ -177,6 +177,16 @@ int u_strcase_equal(const char *str1, const char *str2);
int u_strncase_equal(const char *str1, const char *str2, size_t len);

/*
+ * @str1 valid, normalized, null-terminated UTF-8 string
+ * @str2 valid, normalized, null-terminated UTF-8 string
+ * @len number of characters to consider for comparison
+ *
+ * Like u_strncase_equal(), but uses only base characters for comparison
+ * (e.g. "Trentemöller" matches "Trentemøller")
+ */
+int u_strncase_equal_base(const char *str1, const char *str2, size_t len);
+
+/*
* @haystack valid, normalized, null-terminated UTF-8 string
* @needle valid, normalized, null-terminated UTF-8 string
*
@@ -184,6 +194,15 @@ int u_strncase_equal(const char *str1, const char *str2, size_t len);
*/
char *u_strcasestr(const char *haystack, const char *needle);

+/*
+ * @haystack valid, normalized, null-terminated UTF-8 string
+ * @needle valid, normalized, null-terminated UTF-8 string
+ *
+ * Like u_strcasestr(), but uses only base characters for comparison
+ * (e.g. "Trentemöller" matches "Trentemøller")
+ */
+char *u_strcasestr_base(const char *haystack, const char *needle);
+
static inline char *u_strcasestr_filename(const char *haystack, const char *needle)
{
return u_strcasestr(haystack, needle);
diff --git a/unidecomp.h b/unidecomp.h
new file mode 100644
index 0000000..c0aef52
--- /dev/null
+++ b/unidecomp.h
@@ -0,0 +1,855 @@
+/* This file is automatically generated. DO NOT EDIT!
+Instead, edit gen_decomp.py and re-run. */
+
+static struct {
+ uchar composed;
+ uchar base;
+} unidecomp_map[] = {
+ { 0xc0, 0x41 }, // À -> A, ̀ (300)
+ { 0xc1, 0x41 }, // Á -> A, ́ (301)
+ { 0xc2, 0x41 }, // Â -> A, ̂ (302)
+ { 0xc3, 0x41 }, // Ã -> A, ̃ (303)
+ { 0xc4, 0x41 }, // Ä -> A, ̈ (308)
+ { 0xc5, 0x41 }, // Å -> A, ̊ (30a)
+ { 0xc6, 0x41 }, // Æ -> A,
+ { 0xc7, 0x43 }, // Ç -> C, ̧ (327)
+ { 0xc8, 0x45 }, // È -> E, ̀ (300)
+ { 0xc9, 0x45 }, // É -> E, ́ (301)
+ { 0xca, 0x45 }, // Ê -> E, ̂ (302)
+ { 0xcb, 0x45 }, // Ë -> E, ̈ (308)
+ { 0xcc, 0x49 }, // Ì -> I, ̀ (300)
+ { 0xcd, 0x49 }, // Í -> I, ́ (301)
+ { 0xce, 0x49 }, // Î -> I, ̂ (302)
+ { 0xcf, 0x49 }, // Ï -> I, ̈ (308)
+ { 0xd0, 0x44 }, // Ð -> D,
+ { 0xd1, 0x4e }, // Ñ -> N, ̃ (303)
+ { 0xd2, 0x4f }, // Ò -> O, ̀ (300)
+ { 0xd3, 0x4f }, // Ó -> O, ́ (301)
+ { 0xd4, 0x4f }, // Ô -> O, ̂ (302)
+ { 0xd5, 0x4f }, // Õ -> O, ̃ (303)
+ { 0xd6, 0x4f }, // Ö -> O, ̈ (308)
+ { 0xd7, 0x78 }, // × -> x,
+ { 0xd8, 0x4f }, // Ø -> O,
+ { 0xd9, 0x55 }, // Ù -> U, ̀ (300)
+ { 0xda, 0x55 }, // Ú -> U, ́ (301)
+ { 0xdb, 0x55 }, // Û -> U, ̂ (302)
+ { 0xdc, 0x55 }, // Ü -> U, ̈ (308)
+ { 0xdd, 0x59 }, // Ý -> Y, ́ (301)
+ { 0xde, 0x50 }, // Þ -> P,
+ { 0xdf, 0x42 }, // ß -> B,
+ { 0xe0, 0x61 }, // à -> a, ̀ (300)
+ { 0xe1, 0x61 }, // á -> a, ́ (301)
+ { 0xe2, 0x61 }, // â -> a, ̂ (302)
+ { 0xe3, 0x61 }, // ã -> a, ̃ (303)
+ { 0xe4, 0x61 }, // ä -> a, ̈ (308)
+ { 0xe5, 0x61 }, // å -> a, ̊ (30a)
+ { 0xe6, 0x61 }, // æ -> a,
+ { 0xe7, 0x63 }, // ç -> c, ̧ (327)
+ { 0xe8, 0x65 }, // è -> e, ̀ (300)
+ { 0xe9, 0x65 }, // é -> e, ́ (301)
+ { 0xea, 0x65 }, // ê -> e, ̂ (302)
+ { 0xeb, 0x65 }, // ë -> e, ̈ (308)
+ { 0xec, 0x69 }, // ì -> i, ̀ (300)
+ { 0xed, 0x69 }, // í -> i, ́ (301)
+ { 0xee, 0x69 }, // î -> i, ̂ (302)
+ { 0xef, 0x69 }, // ï -> i, ̈ (308)
+ { 0xf0, 0x64 }, // ð -> d,
+ { 0xf1, 0x6e }, // ñ -> n, ̃ (303)
+ { 0xf2, 0x6f }, // ò -> o, ̀ (300)
+ { 0xf3, 0x6f }, // ó -> o, ́ (301)
+ { 0xf4, 0x6f }, // ô -> o, ̂ (302)
+ { 0xf5, 0x6f }, // õ -> o, ̃ (303)
+ { 0xf6, 0x6f }, // ö -> o, ̈ (308)
+ { 0xf8, 0x6f }, // ø -> o,
+ { 0xf9, 0x75 }, // ù -> u, ̀ (300)
+ { 0xfa, 0x75 }, // ú -> u, ́ (301)
+ { 0xfb, 0x75 }, // û -> u, ̂ (302)
+ { 0xfc, 0x75 }, // ü -> u, ̈ (308)
+ { 0xfd, 0x79 }, // ý -> y, ́ (301)
+ { 0xfe, 0x70 }, // þ -> p,
+ { 0xff, 0x79 }, // ÿ -> y, ̈ (308)
+ { 0x100, 0x41 }, // Ā -> A, ̄ (304)
+ { 0x101, 0x61 }, // ā -> a, ̄ (304)
+ { 0x102, 0x41 }, // Ă -> A, ̆ (306)
+ { 0x103, 0x61 }, // ă -> a, ̆ (306)
+ { 0x104, 0x41 }, // Ą -> A, ̨ (328)
+ { 0x105, 0x61 }, // ą -> a, ̨ (328)
+ { 0x106, 0x43 }, // Ć -> C, ́ (301)
+ { 0x107, 0x63 }, // ć -> c, ́ (301)
+ { 0x108, 0x43 }, // Ĉ -> C, ̂ (302)
+ { 0x109, 0x63 }, // ĉ -> c, ̂ (302)
+ { 0x10a, 0x43 }, // Ċ -> C, ̇ (307)
+ { 0x10b, 0x63 }, // ċ -> c, ̇ (307)
+ { 0x10c, 0x43 }, // Č -> C, ̌ (30c)
+ { 0x10d, 0x63 }, // č -> c, ̌ (30c)
+ { 0x10e, 0x44 }, // Ď -> D, ̌ (30c)
+ { 0x10f, 0x64 }, // ď -> d, ̌ (30c)
+ { 0x112, 0x45 }, // Ē -> E, ̄ (304)
+ { 0x113, 0x65 }, // ē -> e, ̄ (304)
+ { 0x114, 0x45 }, // Ĕ -> E, ̆ (306)
+ { 0x115, 0x65 }, // ĕ -> e, ̆ (306)
+ { 0x116, 0x45 }, // Ė -> E, ̇ (307)
+ { 0x117, 0x65 }, // ė -> e, ̇ (307)
+ { 0x118, 0x45 }, // Ę -> E, ̨ (328)
+ { 0x119, 0x65 }, // ę -> e, ̨ (328)
+ { 0x11a, 0x45 }, // Ě -> E, ̌ (30c)
+ { 0x11b, 0x65 }, // ě -> e, ̌ (30c)
+ { 0x11c, 0x47 }, // Ĝ -> G, ̂ (302)
+ { 0x11d, 0x67 }, // ĝ -> g, ̂ (302)
+ { 0x11e, 0x47 }, // Ğ -> G, ̆ (306)
+ { 0x11f, 0x67 }, // ğ -> g, ̆ (306)
+ { 0x120, 0x47 }, // Ġ -> G, ̇ (307)
+ { 0x121, 0x67 }, // ġ -> g, ̇ (307)
+ { 0x122, 0x47 }, // Ģ -> G, ̧ (327)
+ { 0x123, 0x67 }, // ģ -> g, ̧ (327)
+ { 0x124, 0x48 }, // Ĥ -> H, ̂ (302)
+ { 0x125, 0x68 }, // ĥ -> h, ̂ (302)
+ { 0x128, 0x49 }, // Ĩ -> I, ̃ (303)
+ { 0x129, 0x69 }, // ĩ -> i, ̃ (303)
+ { 0x12a, 0x49 }, // Ī -> I, ̄ (304)
+ { 0x12b, 0x69 }, // ī -> i, ̄ (304)
+ { 0x12c, 0x49 }, // Ĭ -> I, ̆ (306)
+ { 0x12d, 0x69 }, // ĭ -> i, ̆ (306)
+ { 0x12e, 0x49 }, // Į -> I, ̨ (328)
+ { 0x12f, 0x69 }, // į -> i, ̨ (328)
+ { 0x130, 0x49 }, // İ -> I, ̇ (307)
+ { 0x134, 0x4a }, // Ĵ -> J, ̂ (302)
+ { 0x135, 0x6a }, // ĵ -> j, ̂ (302)
+ { 0x136, 0x4b }, // Ķ -> K, ̧ (327)
+ { 0x137, 0x6b }, // ķ -> k, ̧ (327)
+ { 0x139, 0x4c }, // Ĺ -> L, ́ (301)
+ { 0x13a, 0x6c }, // ĺ -> l, ́ (301)
+ { 0x13b, 0x4c }, // Ļ -> L, ̧ (327)
+ { 0x13c, 0x6c }, // ļ -> l, ̧ (327)
+ { 0x13d, 0x4c }, // Ľ -> L, ̌ (30c)
+ { 0x13e, 0x6c }, // ľ -> l, ̌ (30c)
+ { 0x143, 0x4e }, // Ń -> N, ́ (301)
+ { 0x144, 0x6e }, // ń -> n, ́ (301)
+ { 0x145, 0x4e }, // Ņ -> N, ̧ (327)
+ { 0x146, 0x6e }, // ņ -> n, ̧ (327)
+ { 0x147, 0x4e }, // Ň -> N, ̌ (30c)
+ { 0x148, 0x6e }, // ň -> n, ̌ (30c)
+ { 0x14c, 0x4f }, // Ō -> O, ̄ (304)
+ { 0x14d, 0x6f }, // ō -> o, ̄ (304)
+ { 0x14e, 0x4f }, // Ŏ -> O, ̆ (306)
+ { 0x14f, 0x6f }, // ŏ -> o, ̆ (306)
+ { 0x150, 0x4f }, // Ő -> O, ̋ (30b)
+ { 0x151, 0x6f }, // ő -> o, ̋ (30b)
+ { 0x154, 0x52 }, // Ŕ -> R, ́ (301)
+ { 0x155, 0x72 }, // ŕ -> r, ́ (301)
+ { 0x156, 0x52 }, // Ŗ -> R, ̧ (327)
+ { 0x157, 0x72 }, // ŗ -> r, ̧ (327)
+ { 0x158, 0x52 }, // Ř -> R, ̌ (30c)
+ { 0x159, 0x72 }, // ř -> r, ̌ (30c)
+ { 0x15a, 0x53 }, // Ś -> S, ́ (301)
+ { 0x15b, 0x73 }, // ś -> s, ́ (301)
+ { 0x15c, 0x53 }, // Ŝ -> S, ̂ (302)
+ { 0x15d, 0x73 }, // ŝ -> s, ̂ (302)
+ { 0x15e, 0x53 }, // Ş -> S, ̧ (327)
+ { 0x15f, 0x73 }, // ş -> s, ̧ (327)
+ { 0x160, 0x53 }, // Š -> S, ̌ (30c)
+ { 0x161, 0x73 }, // š -> s, ̌ (30c)
+ { 0x162, 0x54 }, // Ţ -> T, ̧ (327)
+ { 0x163, 0x74 }, // ţ -> t, ̧ (327)
+ { 0x164, 0x54 }, // Ť -> T, ̌ (30c)
+ { 0x165, 0x74 }, // ť -> t, ̌ (30c)
+ { 0x168, 0x55 }, // Ũ -> U, ̃ (303)
+ { 0x169, 0x75 }, // ũ -> u, ̃ (303)
+ { 0x16a, 0x55 }, // Ū -> U, ̄ (304)
+ { 0x16b, 0x75 }, // ū -> u, ̄ (304)
+ { 0x16c, 0x55 }, // Ŭ -> U, ̆ (306)
+ { 0x16d, 0x75 }, // ŭ -> u, ̆ (306)
+ { 0x16e, 0x55 }, // Ů -> U, ̊ (30a)
+ { 0x16f, 0x75 }, // ů -> u, ̊ (30a)
+ { 0x170, 0x55 }, // Ű -> U, ̋ (30b)
+ { 0x171, 0x75 }, // ű -> u, ̋ (30b)
+ { 0x172, 0x55 }, // Ų -> U, ̨ (328)
+ { 0x173, 0x75 }, // ų -> u, ̨ (328)
+ { 0x174, 0x57 }, // Ŵ -> W, ̂ (302)
+ { 0x175, 0x77 }, // ŵ -> w, ̂ (302)
+ { 0x176, 0x59 }, // Ŷ -> Y, ̂ (302)
+ { 0x177, 0x79 }, // ŷ -> y, ̂ (302)
+ { 0x178, 0x59 }, // Ÿ -> Y, ̈ (308)
+ { 0x179, 0x5a }, // Ź -> Z, ́ (301)
+ { 0x17a, 0x7a }, // ź -> z, ́ (301)
+ { 0x17b, 0x5a }, // Ż -> Z, ̇ (307)
+ { 0x17c, 0x7a }, // ż -> z, ̇ (307)
+ { 0x17d, 0x5a }, // Ž -> Z, ̌ (30c)
+ { 0x17e, 0x7a }, // ž -> z, ̌ (30c)
+ { 0x1a0, 0x4f }, // Ơ -> O, ̛ (31b)
+ { 0x1a1, 0x6f }, // ơ -> o, ̛ (31b)
+ { 0x1af, 0x55 }, // Ư -> U, ̛ (31b)
+ { 0x1b0, 0x75 }, // ư -> u, ̛ (31b)
+ { 0x1c4, 0x44 }, // DŽ -> D, Z (5a), ̌ (30c)
+ { 0x1c5, 0x44 }, // Dž -> D, z (7a), ̌ (30c)
+ { 0x1c6, 0x64 }, // dž -> d, z (7a), ̌ (30c)
+ { 0x1cd, 0x41 }, // Ǎ -> A, ̌ (30c)
+ { 0x1ce, 0x61 }, // ǎ -> a, ̌ (30c)
+ { 0x1cf, 0x49 }, // Ǐ -> I, ̌ (30c)
+ { 0x1d0, 0x69 }, // ǐ -> i, ̌ (30c)
+ { 0x1d1, 0x4f }, // Ǒ -> O, ̌ (30c)
+ { 0x1d2, 0x6f }, // ǒ -> o, ̌ (30c)
+ { 0x1d3, 0x55 }, // Ǔ -> U, ̌ (30c)
+ { 0x1d4, 0x75 }, // ǔ -> u, ̌ (30c)
+ { 0x1d5, 0x55 }, // Ǖ -> U, ̈ (308), ̄ (304)
+ { 0x1d6, 0x75 }, // ǖ -> u, ̈ (308), ̄ (304)
+ { 0x1d7, 0x55 }, // Ǘ -> U, ̈ (308), ́ (301)
+ { 0x1d8, 0x75 }, // ǘ -> u, ̈ (308), ́ (301)
+ { 0x1d9, 0x55 }, // Ǚ -> U, ̈ (308), ̌ (30c)
+ { 0x1da, 0x75 }, // ǚ -> u, ̈ (308), ̌ (30c)
+ { 0x1db, 0x55 }, // Ǜ -> U, ̈ (308), ̀ (300)
+ { 0x1dc, 0x75 }, // ǜ -> u, ̈ (308), ̀ (300)
+ { 0x1de, 0x41 }, // Ǟ -> A, ̈ (308), ̄ (304)
+ { 0x1df, 0x61 }, // ǟ -> a, ̈ (308), ̄ (304)
+ { 0x1e0, 0x41 }, // Ǡ -> A, ̇ (307), ̄ (304)
+ { 0x1e1, 0x61 }, // ǡ -> a, ̇ (307), ̄ (304)
+ { 0x1e2, 0xc6 }, // Ǣ -> Æ, ̄ (304)
+ { 0x1e3, 0xe6 }, // ǣ -> æ, ̄ (304)
+ { 0x1e6, 0x47 }, // Ǧ -> G, ̌ (30c)
+ { 0x1e7, 0x67 }, // ǧ -> g, ̌ (30c)
+ { 0x1e8, 0x4b }, // Ǩ -> K, ̌ (30c)
+ { 0x1e9, 0x6b }, // ǩ -> k, ̌ (30c)
+ { 0x1ea, 0x4f }, // Ǫ -> O, ̨ (328)
+ { 0x1eb, 0x6f }, // ǫ -> o, ̨ (328)
+ { 0x1ec, 0x4f }, // Ǭ -> O, ̨ (328), ̄ (304)
+ { 0x1ed, 0x6f }, // ǭ -> o, ̨ (328), ̄ (304)
+ { 0x1ee, 0x1b7 }, // Ǯ -> Ʒ, ̌ (30c)
+ { 0x1ef, 0x292 }, // ǯ -> ʒ, ̌ (30c)
+ { 0x1f0, 0x6a }, // ǰ -> j, ̌ (30c)
+ { 0x1f4, 0x47 }, // Ǵ -> G, ́ (301)
+ { 0x1f5, 0x67 }, // ǵ -> g, ́ (301)
+ { 0x1f8, 0x4e }, // Ǹ -> N, ̀ (300)
+ { 0x1f9, 0x6e }, // ǹ -> n, ̀ (300)
+ { 0x1fa, 0x41 }, // Ǻ -> A, ̊ (30a), ́ (301)
+ { 0x1fb, 0x61 }, // ǻ -> a, ̊ (30a), ́ (301)
+ { 0x1fc, 0xc6 }, // Ǽ -> Æ, ́ (301)
+ { 0x1fd, 0xe6 }, // ǽ -> æ, ́ (301)
+ { 0x1fe, 0xd8 }, // Ǿ -> Ø, ́ (301)
+ { 0x1ff, 0xf8 }, // ǿ -> ø, ́ (301)
+ { 0x200, 0x41 }, // Ȁ -> A, ̏ (30f)
+ { 0x201, 0x61 }, // ȁ -> a, ̏ (30f)
+ { 0x202, 0x41 }, // Ȃ -> A, ̑ (311)
+ { 0x203, 0x61 }, // ȃ -> a, ̑ (311)
+ { 0x204, 0x45 }, // Ȅ -> E, ̏ (30f)
+ { 0x205, 0x65 }, // ȅ -> e, ̏ (30f)
+ { 0x206, 0x45 }, // Ȇ -> E, ̑ (311)
+ { 0x207, 0x65 }, // ȇ -> e, ̑ (311)
+ { 0x208, 0x49 }, // Ȉ -> I, ̏ (30f)
+ { 0x209, 0x69 }, // ȉ -> i, ̏ (30f)
+ { 0x20a, 0x49 }, // Ȋ -> I, ̑ (311)
+ { 0x20b, 0x69 }, // ȋ -> i, ̑ (311)
+ { 0x20c, 0x4f }, // Ȍ -> O, ̏ (30f)
+ { 0x20d, 0x6f }, // ȍ -> o, ̏ (30f)
+ { 0x20e, 0x4f }, // Ȏ -> O, ̑ (311)
+ { 0x20f, 0x6f }, // ȏ -> o, ̑ (311)
+ { 0x210, 0x52 }, // Ȑ -> R, ̏ (30f)
+ { 0x211, 0x72 }, // ȑ -> r, ̏ (30f)
+ { 0x212, 0x52 }, // Ȓ -> R, ̑ (311)
+ { 0x213, 0x72 }, // ȓ -> r, ̑ (311)
+ { 0x214, 0x55 }, // Ȕ -> U, ̏ (30f)
+ { 0x215, 0x75 }, // ȕ -> u, ̏ (30f)
+ { 0x216, 0x55 }, // Ȗ -> U, ̑ (311)
+ { 0x217, 0x75 }, // ȗ -> u, ̑ (311)
+ { 0x218, 0x53 }, // Ș -> S, ̦ (326)
+ { 0x219, 0x73 }, // ș -> s, ̦ (326)
+ { 0x21a, 0x54 }, // Ț -> T, ̦ (326)
+ { 0x21b, 0x74 }, // ț -> t, ̦ (326)
+ { 0x21e, 0x48 }, // Ȟ -> H, ̌ (30c)
+ { 0x21f, 0x68 }, // ȟ -> h, ̌ (30c)
+ { 0x226, 0x41 }, // Ȧ -> A, ̇ (307)
+ { 0x227, 0x61 }, // ȧ -> a, ̇ (307)
+ { 0x228, 0x45 }, // Ȩ -> E, ̧ (327)
+ { 0x229, 0x65 }, // ȩ -> e, ̧ (327)
+ { 0x22a, 0x4f }, // Ȫ -> O, ̈ (308), ̄ (304)
+ { 0x22b, 0x6f }, // ȫ -> o, ̈ (308), ̄ (304)
+ { 0x22c, 0x4f }, // Ȭ -> O, ̃ (303), ̄ (304)
+ { 0x22d, 0x6f }, // ȭ -> o, ̃ (303), ̄ (304)
+ { 0x22e, 0x4f }, // Ȯ -> O, ̇ (307)
+ { 0x22f, 0x6f }, // ȯ -> o, ̇ (307)
+ { 0x230, 0x4f }, // Ȱ -> O, ̇ (307), ̄ (304)
+ { 0x231, 0x6f }, // ȱ -> o, ̇ (307), ̄ (304)
+ { 0x232, 0x59 }, // Ȳ -> Y, ̄ (304)
+ { 0x233, 0x79 }, // ȳ -> y, ̄ (304)
+ { 0x386, 0x391 }, // Ά -> Α, ́ (301)
+ { 0x388, 0x395 }, // Έ -> Ε, ́ (301)
+ { 0x389, 0x397 }, // Ή -> Η, ́ (301)
+ { 0x38a, 0x399 }, // Ί -> Ι, ́ (301)
+ { 0x38c, 0x39f }, // Ό -> Ο, ́ (301)
+ { 0x38e, 0x3a5 }, // Ύ -> Υ, ́ (301)
+ { 0x38f, 0x3a9 }, // Ώ -> Ω, ́ (301)
+ { 0x390, 0x3b9 }, // ΐ -> ι, ̈ (308), ́ (301)
+ { 0x3aa, 0x399 }, // Ϊ -> Ι, ̈ (308)
+ { 0x3ab, 0x3a5 }, // Ϋ -> Υ, ̈ (308)
+ { 0x3ac, 0x3b1 }, // ά -> α, ́ (301)
+ { 0x3ad, 0x3b5 }, // έ -> ε, ́ (301)
+ { 0x3ae, 0x3b7 }, // ή -> η, ́ (301)
+ { 0x3af, 0x3b9 }, // ί -> ι, ́ (301)
+ { 0x3b0, 0x3c5 }, // ΰ -> υ, ̈ (308), ́ (301)
+ { 0x3ca, 0x3b9 }, // ϊ -> ι, ̈ (308)
+ { 0x3cb, 0x3c5 }, // ϋ -> υ, ̈ (308)
+ { 0x3cc, 0x3bf }, // ό -> ο, ́ (301)
+ { 0x3cd, 0x3c5 }, // ύ -> υ, ́ (301)
+ { 0x3ce, 0x3c9 }, // ώ -> ω, ́ (301)
+ { 0x3d3, 0x3a5 }, // ϓ -> Υ, ́ (301)
+ { 0x3d4, 0x3a5 }, // ϔ -> Υ, ̈ (308)
+ { 0x400, 0x415 }, // Ѐ -> Е, ̀ (300)
+ { 0x401, 0x415 }, // Ё -> Е, ̈ (308)
+ { 0x403, 0x413 }, // Ѓ -> Г, ́ (301)
+ { 0x407, 0x406 }, // Ї -> І, ̈ (308)
+ { 0x40c, 0x41a }, // Ќ -> К, ́ (301)
+ { 0x40d, 0x418 }, // Ѝ -> И, ̀ (300)
+ { 0x40e, 0x423 }, // Ў -> У, ̆ (306)
+ { 0x419, 0x418 }, // Й -> И, ̆ (306)
+ { 0x439, 0x438 }, // й -> и, ̆ (306)
+ { 0x450, 0x435 }, // ѐ -> е, ̀ (300)
+ { 0x451, 0x435 }, // ё -> е, ̈ (308)
+ { 0x453, 0x433 }, // ѓ -> г, ́ (301)
+ { 0x457, 0x456 }, // ї -> і, ̈ (308)
+ { 0x45c, 0x43a }, // ќ -> к, ́ (301)
+ { 0x45d, 0x438 }, // ѝ -> и, ̀ (300)
+ { 0x45e, 0x443 }, // ў -> у, ̆ (306)
+ { 0x476, 0x474 }, // Ѷ -> Ѵ, ̏ (30f)
+ { 0x477, 0x475 }, // ѷ -> ѵ, ̏ (30f)
+ { 0x4c1, 0x416 }, // Ӂ -> Ж, ̆ (306)
+ { 0x4c2, 0x436 }, // ӂ -> ж, ̆ (306)
+ { 0x4d0, 0x410 }, // Ӑ -> А, ̆ (306)
+ { 0x4d1, 0x430 }, // ӑ -> а, ̆ (306)
+ { 0x4d2, 0x410 }, // Ӓ -> А, ̈ (308)
+ { 0x4d3, 0x430 }, // ӓ -> а, ̈ (308)
+ { 0x4d6, 0x415 }, // Ӗ -> Е, ̆ (306)
+ { 0x4d7, 0x435 }, // ӗ -> е, ̆ (306)
+ { 0x4da, 0x4d8 }, // Ӛ -> Ә, ̈ (308)
+ { 0x4db, 0x4d9 }, // ӛ -> ә, ̈ (308)
+ { 0x4dc, 0x416 }, // Ӝ -> Ж, ̈ (308)
+ { 0x4dd, 0x436 }, // ӝ -> ж, ̈ (308)
+ { 0x4de, 0x417 }, // Ӟ -> З, ̈ (308)
+ { 0x4df, 0x437 }, // ӟ -> з, ̈ (308)
+ { 0x4e2, 0x418 }, // Ӣ -> И, ̄ (304)
+ { 0x4e3, 0x438 }, // ӣ -> и, ̄ (304)
+ { 0x4e4, 0x418 }, // Ӥ -> И, ̈ (308)
+ { 0x4e5, 0x438 }, // ӥ -> и, ̈ (308)
+ { 0x4e6, 0x41e }, // Ӧ -> О, ̈ (308)
+ { 0x4e7, 0x43e }, // ӧ -> о, ̈ (308)
+ { 0x4ea, 0x4e8 }, // Ӫ -> Ө, ̈ (308)
+ { 0x4eb, 0x4e9 }, // ӫ -> ө, ̈ (308)
+ { 0x4ec, 0x42d }, // Ӭ -> Э, ̈ (308)
+ { 0x4ed, 0x44d }, // ӭ -> э, ̈ (308)
+ { 0x4ee, 0x423 }, // Ӯ -> У, ̄ (304)
+ { 0x4ef, 0x443 }, // ӯ -> у, ̄ (304)
+ { 0x4f0, 0x423 }, // Ӱ -> У, ̈ (308)
+ { 0x4f1, 0x443 }, // ӱ -> у, ̈ (308)
+ { 0x4f2, 0x423 }, // Ӳ -> У, ̋ (30b)
+ { 0x4f3, 0x443 }, // ӳ -> у, ̋ (30b)
+ { 0x4f4, 0x427 }, // Ӵ -> Ч, ̈ (308)
+ { 0x4f5, 0x447 }, // ӵ -> ч, ̈ (308)
+ { 0x4f8, 0x42b }, // Ӹ -> Ы, ̈ (308)
+ { 0x4f9, 0x44b }, // ӹ -> ы, ̈ (308)
+ { 0x1e00, 0x41 }, // Ḁ -> A, ̥ (325)
+ { 0x1e01, 0x61 }, // ḁ -> a, ̥ (325)
+ { 0x1e02, 0x42 }, // Ḃ -> B, ̇ (307)
+ { 0x1e03, 0x62 }, // ḃ -> b, ̇ (307)
+ { 0x1e04, 0x42 }, // Ḅ -> B, ̣ (323)
+ { 0x1e05, 0x62 }, // ḅ -> b, ̣ (323)
+ { 0x1e06, 0x42 }, // Ḇ -> B, ̱ (331)
+ { 0x1e07, 0x62 }, // ḇ -> b, ̱ (331)
+ { 0x1e08, 0x43 }, // Ḉ -> C, ̧ (327), ́ (301)
+ { 0x1e09, 0x63 }, // ḉ -> c, ̧ (327), ́ (301)
+ { 0x1e0a, 0x44 }, // Ḋ -> D, ̇ (307)
+ { 0x1e0b, 0x64 }, // ḋ -> d, ̇ (307)
+ { 0x1e0c, 0x44 }, // Ḍ -> D, ̣ (323)
+ { 0x1e0d, 0x64 }, // ḍ -> d, ̣ (323)
+ { 0x1e0e, 0x44 }, // Ḏ -> D, ̱ (331)
+ { 0x1e0f, 0x64 }, // ḏ -> d, ̱ (331)
+ { 0x1e10, 0x44 }, // Ḑ -> D, ̧ (327)
+ { 0x1e11, 0x64 }, // ḑ -> d, ̧ (327)
+ { 0x1e12, 0x44 }, // Ḓ -> D, ̭ (32d)
+ { 0x1e13, 0x64 }, // ḓ -> d, ̭ (32d)
+ { 0x1e14, 0x45 }, // Ḕ -> E, ̄ (304), ̀ (300)
+ { 0x1e15, 0x65 }, // ḕ -> e, ̄ (304), ̀ (300)
+ { 0x1e16, 0x45 }, // Ḗ -> E, ̄ (304), ́ (301)
+ { 0x1e17, 0x65 }, // ḗ -> e, ̄ (304), ́ (301)
+ { 0x1e18, 0x45 }, // Ḙ -> E, ̭ (32d)
+ { 0x1e19, 0x65 }, // ḙ -> e, ̭ (32d)
+ { 0x1e1a, 0x45 }, // Ḛ -> E, ̰ (330)
+ { 0x1e1b, 0x65 }, // ḛ -> e, ̰ (330)
+ { 0x1e1c, 0x45 }, // Ḝ -> E, ̧ (327), ̆ (306)
+ { 0x1e1d, 0x65 }, // ḝ -> e, ̧ (327), ̆ (306)
+ { 0x1e1e, 0x46 }, // Ḟ -> F, ̇ (307)
+ { 0x1e1f, 0x66 }, // ḟ -> f, ̇ (307)
+ { 0x1e20, 0x47 }, // Ḡ -> G, ̄ (304)
+ { 0x1e21, 0x67 }, // ḡ -> g, ̄ (304)
+ { 0x1e22, 0x48 }, // Ḣ -> H, ̇ (307)
+ { 0x1e23, 0x68 }, // ḣ -> h, ̇ (307)
+ { 0x1e24, 0x48 }, // Ḥ -> H, ̣ (323)
+ { 0x1e25, 0x68 }, // ḥ -> h, ̣ (323)
+ { 0x1e26, 0x48 }, // Ḧ -> H, ̈ (308)
+ { 0x1e27, 0x68 }, // ḧ -> h, ̈ (308)
+ { 0x1e28, 0x48 }, // Ḩ -> H, ̧ (327)
+ { 0x1e29, 0x68 }, // ḩ -> h, ̧ (327)
+ { 0x1e2a, 0x48 }, // Ḫ -> H, ̮ (32e)
+ { 0x1e2b, 0x68 }, // ḫ -> h, ̮ (32e)
+ { 0x1e2c, 0x49 }, // Ḭ -> I, ̰ (330)
+ { 0x1e2d, 0x69 }, // ḭ -> i, ̰ (330)
+ { 0x1e2e, 0x49 }, // Ḯ -> I, ̈ (308), ́ (301)
+ { 0x1e2f, 0x69 }, // ḯ -> i, ̈ (308), ́ (301)
+ { 0x1e30, 0x4b }, // Ḱ -> K, ́ (301)
+ { 0x1e31, 0x6b }, // ḱ -> k, ́ (301)
+ { 0x1e32, 0x4b }, // Ḳ -> K, ̣ (323)
+ { 0x1e33, 0x6b }, // ḳ -> k, ̣ (323)
+ { 0x1e34, 0x4b }, // Ḵ -> K, ̱ (331)
+ { 0x1e35, 0x6b }, // ḵ -> k, ̱ (331)
+ { 0x1e36, 0x4c }, // Ḷ -> L, ̣ (323)
+ { 0x1e37, 0x6c }, // ḷ -> l, ̣ (323)
+ { 0x1e38, 0x4c }, // Ḹ -> L, ̣ (323), ̄ (304)
+ { 0x1e39, 0x6c }, // ḹ -> l, ̣ (323), ̄ (304)
+ { 0x1e3a, 0x4c }, // Ḻ -> L, ̱ (331)
+ { 0x1e3b, 0x6c }, // ḻ -> l, ̱ (331)
+ { 0x1e3c, 0x4c }, // Ḽ -> L, ̭ (32d)
+ { 0x1e3d, 0x6c }, // ḽ -> l, ̭ (32d)
+ { 0x1e3e, 0x4d }, // Ḿ -> M, ́ (301)
+ { 0x1e3f, 0x6d }, // ḿ -> m, ́ (301)
+ { 0x1e40, 0x4d }, // Ṁ -> M, ̇ (307)
+ { 0x1e41, 0x6d }, // ṁ -> m, ̇ (307)
+ { 0x1e42, 0x4d }, // Ṃ -> M, ̣ (323)
+ { 0x1e43, 0x6d }, // ṃ -> m, ̣ (323)
+ { 0x1e44, 0x4e }, // Ṅ -> N, ̇ (307)
+ { 0x1e45, 0x6e }, // ṅ -> n, ̇ (307)
+ { 0x1e46, 0x4e }, // Ṇ -> N, ̣ (323)
+ { 0x1e47, 0x6e }, // ṇ -> n, ̣ (323)
+ { 0x1e48, 0x4e }, // Ṉ -> N, ̱ (331)
+ { 0x1e49, 0x6e }, // ṉ -> n, ̱ (331)
+ { 0x1e4a, 0x4e }, // Ṋ -> N, ̭ (32d)
+ { 0x1e4b, 0x6e }, // ṋ -> n, ̭ (32d)
+ { 0x1e4c, 0x4f }, // Ṍ -> O, ̃ (303), ́ (301)
+ { 0x1e4d, 0x6f }, // ṍ -> o, ̃ (303), ́ (301)
+ { 0x1e4e, 0x4f }, // Ṏ -> O, ̃ (303), ̈ (308)
+ { 0x1e4f, 0x6f }, // ṏ -> o, ̃ (303), ̈ (308)
+ { 0x1e50, 0x4f }, // Ṑ -> O, ̄ (304), ̀ (300)
+ { 0x1e51, 0x6f }, // ṑ -> o, ̄ (304), ̀ (300)
+ { 0x1e52, 0x4f }, // Ṓ -> O, ̄ (304), ́ (301)
+ { 0x1e53, 0x6f }, // ṓ -> o, ̄ (304), ́ (301)
+ { 0x1e54, 0x50 }, // Ṕ -> P, ́ (301)
+ { 0x1e55, 0x70 }, // ṕ -> p, ́ (301)
+ { 0x1e56, 0x50 }, // Ṗ -> P, ̇ (307)
+ { 0x1e57, 0x70 }, // ṗ -> p, ̇ (307)
+ { 0x1e58, 0x52 }, // Ṙ -> R, ̇ (307)
+ { 0x1e59, 0x72 }, // ṙ -> r, ̇ (307)
+ { 0x1e5a, 0x52 }, // Ṛ -> R, ̣ (323)
+ { 0x1e5b, 0x72 }, // ṛ -> r, ̣ (323)
+ { 0x1e5c, 0x52 }, // Ṝ -> R, ̣ (323), ̄ (304)
+ { 0x1e5d, 0x72 }, // ṝ -> r, ̣ (323), ̄ (304)
+ { 0x1e5e, 0x52 }, // Ṟ -> R, ̱ (331)
+ { 0x1e5f, 0x72 }, // ṟ -> r, ̱ (331)
+ { 0x1e60, 0x53 }, // Ṡ -> S, ̇ (307)
+ { 0x1e61, 0x73 }, // ṡ -> s, ̇ (307)
+ { 0x1e62, 0x53 }, // Ṣ -> S, ̣ (323)
+ { 0x1e63, 0x73 }, // ṣ -> s, ̣ (323)
+ { 0x1e64, 0x53 }, // Ṥ -> S, ́ (301), ̇ (307)
+ { 0x1e65, 0x73 }, // ṥ -> s, ́ (301), ̇ (307)
+ { 0x1e66, 0x53 }, // Ṧ -> S, ̌ (30c), ̇ (307)
+ { 0x1e67, 0x73 }, // ṧ -> s, ̌ (30c), ̇ (307)
+ { 0x1e68, 0x53 }, // Ṩ -> S, ̣ (323), ̇ (307)
+ { 0x1e69, 0x73 }, // ṩ -> s, ̣ (323), ̇ (307)
+ { 0x1e6a, 0x54 }, // Ṫ -> T, ̇ (307)
+ { 0x1e6b, 0x74 }, // ṫ -> t, ̇ (307)
+ { 0x1e6c, 0x54 }, // Ṭ -> T, ̣ (323)
+ { 0x1e6d, 0x74 }, // ṭ -> t, ̣ (323)
+ { 0x1e6e, 0x54 }, // Ṯ -> T, ̱ (331)
+ { 0x1e6f, 0x74 }, // ṯ -> t, ̱ (331)
+ { 0x1e70, 0x54 }, // Ṱ -> T, ̭ (32d)
+ { 0x1e71, 0x74 }, // ṱ -> t, ̭ (32d)
+ { 0x1e72, 0x55 }, // Ṳ -> U, ̤ (324)
+ { 0x1e73, 0x75 }, // ṳ -> u, ̤ (324)
+ { 0x1e74, 0x55 }, // Ṵ -> U, ̰ (330)
+ { 0x1e75, 0x75 }, // ṵ -> u, ̰ (330)
+ { 0x1e76, 0x55 }, // Ṷ -> U, ̭ (32d)
+ { 0x1e77, 0x75 }, // ṷ -> u, ̭ (32d)
+ { 0x1e78, 0x55 }, // Ṹ -> U, ̃ (303), ́ (301)
+ { 0x1e79, 0x75 }, // ṹ -> u, ̃ (303), ́ (301)
+ { 0x1e7a, 0x55 }, // Ṻ -> U, ̄ (304), ̈ (308)
+ { 0x1e7b, 0x75 }, // ṻ -> u, ̄ (304), ̈ (308)
+ { 0x1e7c, 0x56 }, // Ṽ -> V, ̃ (303)
+ { 0x1e7d, 0x76 }, // ṽ -> v, ̃ (303)
+ { 0x1e7e, 0x56 }, // Ṿ -> V, ̣ (323)
+ { 0x1e7f, 0x76 }, // ṿ -> v, ̣ (323)
+ { 0x1e80, 0x57 }, // Ẁ -> W, ̀ (300)
+ { 0x1e81, 0x77 }, // ẁ -> w, ̀ (300)
+ { 0x1e82, 0x57 }, // Ẃ -> W, ́ (301)
+ { 0x1e83, 0x77 }, // ẃ -> w, ́ (301)
+ { 0x1e84, 0x57 }, // Ẅ -> W, ̈ (308)
+ { 0x1e85, 0x77 }, // ẅ -> w, ̈ (308)
+ { 0x1e86, 0x57 }, // Ẇ -> W, ̇ (307)
+ { 0x1e87, 0x77 }, // ẇ -> w, ̇ (307)
+ { 0x1e88, 0x57 }, // Ẉ -> W, ̣ (323)
+ { 0x1e89, 0x77 }, // ẉ -> w, ̣ (323)
+ { 0x1e8a, 0x58 }, // Ẋ -> X, ̇ (307)
+ { 0x1e8b, 0x78 }, // ẋ -> x, ̇ (307)
+ { 0x1e8c, 0x58 }, // Ẍ -> X, ̈ (308)
+ { 0x1e8d, 0x78 }, // ẍ -> x, ̈ (308)
+ { 0x1e8e, 0x59 }, // Ẏ -> Y, ̇ (307)
+ { 0x1e8f, 0x79 }, // ẏ -> y, ̇ (307)
+ { 0x1e90, 0x5a }, // Ẑ -> Z, ̂ (302)
+ { 0x1e91, 0x7a }, // ẑ -> z, ̂ (302)
+ { 0x1e92, 0x5a }, // Ẓ -> Z, ̣ (323)
+ { 0x1e93, 0x7a }, // ẓ -> z, ̣ (323)
+ { 0x1e94, 0x5a }, // Ẕ -> Z, ̱ (331)
+ { 0x1e95, 0x7a }, // ẕ -> z, ̱ (331)
+ { 0x1e96, 0x68 }, // ẖ -> h, ̱ (331)
+ { 0x1e97, 0x74 }, // ẗ -> t, ̈ (308)
+ { 0x1e98, 0x77 }, // ẘ -> w, ̊ (30a)
+ { 0x1e99, 0x79 }, // ẙ -> y, ̊ (30a)
+ { 0x1e9b, 0x73 }, // ẛ -> s, ̇ (307)
+ { 0x1ea0, 0x41 }, // Ạ -> A, ̣ (323)
+ { 0x1ea1, 0x61 }, // ạ -> a, ̣ (323)
+ { 0x1ea2, 0x41 }, // Ả -> A, ̉ (309)
+ { 0x1ea3, 0x61 }, // ả -> a, ̉ (309)
+ { 0x1ea4, 0x41 }, // Ấ -> A, ̂ (302), ́ (301)
+ { 0x1ea5, 0x61 }, // ấ -> a, ̂ (302), ́ (301)
+ { 0x1ea6, 0x41 }, // Ầ -> A, ̂ (302), ̀ (300)
+ { 0x1ea7, 0x61 }, // ầ -> a, ̂ (302), ̀ (300)
+ { 0x1ea8, 0x41 }, // Ẩ -> A, ̂ (302), ̉ (309)
+ { 0x1ea9, 0x61 }, // ẩ -> a, ̂ (302), ̉ (309)
+ { 0x1eaa, 0x41 }, // Ẫ -> A, ̂ (302), ̃ (303)
+ { 0x1eab, 0x61 }, // ẫ -> a, ̂ (302), ̃ (303)
+ { 0x1eac, 0x41 }, // Ậ -> A, ̣ (323), ̂ (302)
+ { 0x1ead, 0x61 }, // ậ -> a, ̣ (323), ̂ (302)
+ { 0x1eae, 0x41 }, // Ắ -> A, ̆ (306), ́ (301)
+ { 0x1eaf, 0x61 }, // ắ -> a, ̆ (306), ́ (301)
+ { 0x1eb0, 0x41 }, // Ằ -> A, ̆ (306), ̀ (300)
+ { 0x1eb1, 0x61 }, // ằ -> a, ̆ (306), ̀ (300)
+ { 0x1eb2, 0x41 }, // Ẳ -> A, ̆ (306), ̉ (309)
+ { 0x1eb3, 0x61 }, // ẳ -> a, ̆ (306), ̉ (309)
+ { 0x1eb4, 0x41 }, // Ẵ -> A, ̆ (306), ̃ (303)
+ { 0x1eb5, 0x61 }, // ẵ -> a, ̆ (306), ̃ (303)
+ { 0x1eb6, 0x41 }, // Ặ -> A, ̣ (323), ̆ (306)
+ { 0x1eb7, 0x61 }, // ặ -> a, ̣ (323), ̆ (306)
+ { 0x1eb8, 0x45 }, // Ẹ -> E, ̣ (323)
+ { 0x1eb9, 0x65 }, // ẹ -> e, ̣ (323)
+ { 0x1eba, 0x45 }, // Ẻ -> E, ̉ (309)
+ { 0x1ebb, 0x65 }, // ẻ -> e, ̉ (309)
+ { 0x1ebc, 0x45 }, // Ẽ -> E, ̃ (303)
+ { 0x1ebd, 0x65 }, // ẽ -> e, ̃ (303)
+ { 0x1ebe, 0x45 }, // Ế -> E, ̂ (302), ́ (301)
+ { 0x1ebf, 0x65 }, // ế -> e, ̂ (302), ́ (301)
+ { 0x1ec0, 0x45 }, // Ề -> E, ̂ (302), ̀ (300)
+ { 0x1ec1, 0x65 }, // ề -> e, ̂ (302), ̀ (300)
+ { 0x1ec2, 0x45 }, // Ể -> E, ̂ (302), ̉ (309)
+ { 0x1ec3, 0x65 }, // ể -> e, ̂ (302), ̉ (309)
+ { 0x1ec4, 0x45 }, // Ễ -> E, ̂ (302), ̃ (303)
+ { 0x1ec5, 0x65 }, // ễ -> e, ̂ (302), ̃ (303)
+ { 0x1ec6, 0x45 }, // Ệ -> E, ̣ (323), ̂ (302)
+ { 0x1ec7, 0x65 }, // ệ -> e, ̣ (323), ̂ (302)
+ { 0x1ec8, 0x49 }, // Ỉ -> I, ̉ (309)
+ { 0x1ec9, 0x69 }, // ỉ -> i, ̉ (309)
+ { 0x1eca, 0x49 }, // Ị -> I, ̣ (323)
+ { 0x1ecb, 0x69 }, // ị -> i, ̣ (323)
+ { 0x1ecc, 0x4f }, // Ọ -> O, ̣ (323)
+ { 0x1ecd, 0x6f }, // ọ -> o, ̣ (323)
+ { 0x1ece, 0x4f }, // Ỏ -> O, ̉ (309)
+ { 0x1ecf, 0x6f }, // ỏ -> o, ̉ (309)
+ { 0x1ed0, 0x4f }, // Ố -> O, ̂ (302), ́ (301)
+ { 0x1ed1, 0x6f }, // ố -> o, ̂ (302), ́ (301)
+ { 0x1ed2, 0x4f }, // Ồ -> O, ̂ (302), ̀ (300)
+ { 0x1ed3, 0x6f }, // ồ -> o, ̂ (302), ̀ (300)
+ { 0x1ed4, 0x4f }, // Ổ -> O, ̂ (302), ̉ (309)
+ { 0x1ed5, 0x6f }, // ổ -> o, ̂ (302), ̉ (309)
+ { 0x1ed6, 0x4f }, // Ỗ -> O, ̂ (302), ̃ (303)
+ { 0x1ed7, 0x6f }, // ỗ -> o, ̂ (302), ̃ (303)
+ { 0x1ed8, 0x4f }, // Ộ -> O, ̣ (323), ̂ (302)
+ { 0x1ed9, 0x6f }, // ộ -> o, ̣ (323), ̂ (302)
+ { 0x1eda, 0x4f }, // Ớ -> O, ̛ (31b), ́ (301)
+ { 0x1edb, 0x6f }, // ớ -> o, ̛ (31b), ́ (301)
+ { 0x1edc, 0x4f }, // Ờ -> O, ̛ (31b), ̀ (300)
+ { 0x1edd, 0x6f }, // ờ -> o, ̛ (31b), ̀ (300)
+ { 0x1ede, 0x4f }, // Ở -> O, ̛ (31b), ̉ (309)
+ { 0x1edf, 0x6f }, // ở -> o, ̛ (31b), ̉ (309)
+ { 0x1ee0, 0x4f }, // Ỡ -> O, ̛ (31b), ̃ (303)
+ { 0x1ee1, 0x6f }, // ỡ -> o, ̛ (31b), ̃ (303)
+ { 0x1ee2, 0x4f }, // Ợ -> O, ̛ (31b), ̣ (323)
+ { 0x1ee3, 0x6f }, // ợ -> o, ̛ (31b), ̣ (323)
+ { 0x1ee4, 0x55 }, // Ụ -> U, ̣ (323)
+ { 0x1ee5, 0x75 }, // ụ -> u, ̣ (323)
+ { 0x1ee6, 0x55 }, // Ủ -> U, ̉ (309)
+ { 0x1ee7, 0x75 }, // ủ -> u, ̉ (309)
+ { 0x1ee8, 0x55 }, // Ứ -> U, ̛ (31b), ́ (301)
+ { 0x1ee9, 0x75 }, // ứ -> u, ̛ (31b), ́ (301)
+ { 0x1eea, 0x55 }, // Ừ -> U, ̛ (31b), ̀ (300)
+ { 0x1eeb, 0x75 }, // ừ -> u, ̛ (31b), ̀ (300)
+ { 0x1eec, 0x55 }, // Ử -> U, ̛ (31b), ̉ (309)
+ { 0x1eed, 0x75 }, // ử -> u, ̛ (31b), ̉ (309)
+ { 0x1eee, 0x55 }, // Ữ -> U, ̛ (31b), ̃ (303)
+ { 0x1eef, 0x75 }, // ữ -> u, ̛ (31b), ̃ (303)
+ { 0x1ef0, 0x55 }, // Ự -> U, ̛ (31b), ̣ (323)
+ { 0x1ef1, 0x75 }, // ự -> u, ̛ (31b), ̣ (323)
+ { 0x1ef2, 0x59 }, // Ỳ -> Y, ̀ (300)
+ { 0x1ef3, 0x79 }, // ỳ -> y, ̀ (300)
+ { 0x1ef4, 0x59 }, // Ỵ -> Y, ̣ (323)
+ { 0x1ef5, 0x79 }, // ỵ -> y, ̣ (323)
+ { 0x1ef6, 0x59 }, // Ỷ -> Y, ̉ (309)
+ { 0x1ef7, 0x79 }, // ỷ -> y, ̉ (309)
+ { 0x1ef8, 0x59 }, // Ỹ -> Y, ̃ (303)
+ { 0x1ef9, 0x79 }, // ỹ -> y, ̃ (303)
+ { 0x1f00, 0x3b1 }, // ἀ -> α, ̓ (313)
+ { 0x1f01, 0x3b1 }, // ἁ -> α, ̔ (314)
+ { 0x1f02, 0x3b1 }, // ἂ -> α, ̓ (313), ̀ (300)
+ { 0x1f03, 0x3b1 }, // ἃ -> α, ̔ (314), ̀ (300)
+ { 0x1f04, 0x3b1 }, // ἄ -> α, ̓ (313), ́ (301)
+ { 0x1f05, 0x3b1 }, // ἅ -> α, ̔ (314), ́ (301)
+ { 0x1f06, 0x3b1 }, // ἆ -> α, ̓ (313), ͂ (342)
+ { 0x1f07, 0x3b1 }, // ἇ -> α, ̔ (314), ͂ (342)
+ { 0x1f08, 0x391 }, // Ἀ -> Α, ̓ (313)
+ { 0x1f09, 0x391 }, // Ἁ -> Α, ̔ (314)
+ { 0x1f0a, 0x391 }, // Ἂ -> Α, ̓ (313), ̀ (300)
+ { 0x1f0b, 0x391 }, // Ἃ -> Α, ̔ (314), ̀ (300)
+ { 0x1f0c, 0x391 }, // Ἄ -> Α, ̓ (313), ́ (301)
+ { 0x1f0d, 0x391 }, // Ἅ -> Α, ̔ (314), ́ (301)
+ { 0x1f0e, 0x391 }, // Ἆ -> Α, ̓ (313), ͂ (342)
+ { 0x1f0f, 0x391 }, // Ἇ -> Α, ̔ (314), ͂ (342)
+ { 0x1f10, 0x3b5 }, // ἐ -> ε, ̓ (313)
+ { 0x1f11, 0x3b5 }, // ἑ -> ε, ̔ (314)
+ { 0x1f12, 0x3b5 }, // ἒ -> ε, ̓ (313), ̀ (300)
+ { 0x1f13, 0x3b5 }, // ἓ -> ε, ̔ (314), ̀ (300)
+ { 0x1f14, 0x3b5 }, // ἔ -> ε, ̓ (313), ́ (301)
+ { 0x1f15, 0x3b5 }, // ἕ -> ε, ̔ (314), ́ (301)
+ { 0x1f18, 0x395 }, // Ἐ -> Ε, ̓ (313)
+ { 0x1f19, 0x395 }, // Ἑ -> Ε, ̔ (314)
+ { 0x1f1a, 0x395 }, // Ἒ -> Ε, ̓ (313), ̀ (300)
+ { 0x1f1b, 0x395 }, // Ἓ -> Ε, ̔ (314), ̀ (300)
+ { 0x1f1c, 0x395 }, // Ἔ -> Ε, ̓ (313), ́ (301)
+ { 0x1f1d, 0x395 }, // Ἕ -> Ε, ̔ (314), ́ (301)
+ { 0x1f20, 0x3b7 }, // ἠ -> η, ̓ (313)
+ { 0x1f21, 0x3b7 }, // ἡ -> η, ̔ (314)
+ { 0x1f22, 0x3b7 }, // ἢ -> η, ̓ (313), ̀ (300)
+ { 0x1f23, 0x3b7 }, // ἣ -> η, ̔ (314), ̀ (300)
+ { 0x1f24, 0x3b7 }, // ἤ -> η, ̓ (313), ́ (301)
+ { 0x1f25, 0x3b7 }, // ἥ -> η, ̔ (314), ́ (301)
+ { 0x1f26, 0x3b7 }, // ἦ -> η, ̓ (313), ͂ (342)
+ { 0x1f27, 0x3b7 }, // ἧ -> η, ̔ (314), ͂ (342)
+ { 0x1f28, 0x397 }, // Ἠ -> Η, ̓ (313)
+ { 0x1f29, 0x397 }, // Ἡ -> Η, ̔ (314)
+ { 0x1f2a, 0x397 }, // Ἢ -> Η, ̓ (313), ̀ (300)
+ { 0x1f2b, 0x397 }, // Ἣ -> Η, ̔ (314), ̀ (300)
+ { 0x1f2c, 0x397 }, // Ἤ -> Η, ̓ (313), ́ (301)
+ { 0x1f2d, 0x397 }, // Ἥ -> Η, ̔ (314), ́ (301)
+ { 0x1f2e, 0x397 }, // Ἦ -> Η, ̓ (313), ͂ (342)
+ { 0x1f2f, 0x397 }, // Ἧ -> Η, ̔ (314), ͂ (342)
+ { 0x1f30, 0x3b9 }, // ἰ -> ι, ̓ (313)
+ { 0x1f31, 0x3b9 }, // ἱ -> ι, ̔ (314)
+ { 0x1f32, 0x3b9 }, // ἲ -> ι, ̓ (313), ̀ (300)
+ { 0x1f33, 0x3b9 }, // ἳ -> ι, ̔ (314), ̀ (300)
+ { 0x1f34, 0x3b9 }, // ἴ -> ι, ̓ (313), ́ (301)
+ { 0x1f35, 0x3b9 }, // ἵ -> ι, ̔ (314), ́ (301)
+ { 0x1f36, 0x3b9 }, // ἶ -> ι, ̓ (313), ͂ (342)
+ { 0x1f37, 0x3b9 }, // ἷ -> ι, ̔ (314), ͂ (342)
+ { 0x1f38, 0x399 }, // Ἰ -> Ι, ̓ (313)
+ { 0x1f39, 0x399 }, // Ἱ -> Ι, ̔ (314)
+ { 0x1f3a, 0x399 }, // Ἲ -> Ι, ̓ (313), ̀ (300)
+ { 0x1f3b, 0x399 }, // Ἳ -> Ι, ̔ (314), ̀ (300)
+ { 0x1f3c, 0x399 }, // Ἴ -> Ι, ̓ (313), ́ (301)
+ { 0x1f3d, 0x399 }, // Ἵ -> Ι, ̔ (314), ́ (301)
+ { 0x1f3e, 0x399 }, // Ἶ -> Ι, ̓ (313), ͂ (342)
+ { 0x1f3f, 0x399 }, // Ἷ -> Ι, ̔ (314), ͂ (342)
+ { 0x1f40, 0x3bf }, // ὀ -> ο, ̓ (313)
+ { 0x1f41, 0x3bf }, // ὁ -> ο, ̔ (314)
+ { 0x1f42, 0x3bf }, // ὂ -> ο, ̓ (313), ̀ (300)
+ { 0x1f43, 0x3bf }, // ὃ -> ο, ̔ (314), ̀ (300)
+ { 0x1f44, 0x3bf }, // ὄ -> ο, ̓ (313), ́ (301)
+ { 0x1f45, 0x3bf }, // ὅ -> ο, ̔ (314), ́ (301)
+ { 0x1f48, 0x39f }, // Ὀ -> Ο, ̓ (313)
+ { 0x1f49, 0x39f }, // Ὁ -> Ο, ̔ (314)
+ { 0x1f4a, 0x39f }, // Ὂ -> Ο, ̓ (313), ̀ (300)
+ { 0x1f4b, 0x39f }, // Ὃ -> Ο, ̔ (314), ̀ (300)
+ { 0x1f4c, 0x39f }, // Ὄ -> Ο, ̓ (313), ́ (301)
+ { 0x1f4d, 0x39f }, // Ὅ -> Ο, ̔ (314), ́ (301)
+ { 0x1f50, 0x3c5 }, // ὐ -> υ, ̓ (313)
+ { 0x1f51, 0x3c5 }, // ὑ -> υ, ̔ (314)
+ { 0x1f52, 0x3c5 }, // ὒ -> υ, ̓ (313), ̀ (300)
+ { 0x1f53, 0x3c5 }, // ὓ -> υ, ̔ (314), ̀ (300)
+ { 0x1f54, 0x3c5 }, // ὔ -> υ, ̓ (313), ́ (301)
+ { 0x1f55, 0x3c5 }, // ὕ -> υ, ̔ (314), ́ (301)
+ { 0x1f56, 0x3c5 }, // ὖ -> υ, ̓ (313), ͂ (342)
+ { 0x1f57, 0x3c5 }, // ὗ -> υ, ̔ (314), ͂ (342)
+ { 0x1f59, 0x3a5 }, // Ὑ -> Υ, ̔ (314)
+ { 0x1f5b, 0x3a5 }, // Ὓ -> Υ, ̔ (314), ̀ (300)
+ { 0x1f5d, 0x3a5 }, // Ὕ -> Υ, ̔ (314), ́ (301)
+ { 0x1f5f, 0x3a5 }, // Ὗ -> Υ, ̔ (314), ͂ (342)
+ { 0x1f60, 0x3c9 }, // ὠ -> ω, ̓ (313)
+ { 0x1f61, 0x3c9 }, // ὡ -> ω, ̔ (314)
+ { 0x1f62, 0x3c9 }, // ὢ -> ω, ̓ (313), ̀ (300)
+ { 0x1f63, 0x3c9 }, // ὣ -> ω, ̔ (314), ̀ (300)
+ { 0x1f64, 0x3c9 }, // ὤ -> ω, ̓ (313), ́ (301)
+ { 0x1f65, 0x3c9 }, // ὥ -> ω, ̔ (314), ́ (301)
+ { 0x1f66, 0x3c9 }, // ὦ -> ω, ̓ (313), ͂ (342)
+ { 0x1f67, 0x3c9 }, // ὧ -> ω, ̔ (314), ͂ (342)
+ { 0x1f68, 0x3a9 }, // Ὠ -> Ω, ̓ (313)
+ { 0x1f69, 0x3a9 }, // Ὡ -> Ω, ̔ (314)
+ { 0x1f6a, 0x3a9 }, // Ὢ -> Ω, ̓ (313), ̀ (300)
+ { 0x1f6b, 0x3a9 }, // Ὣ -> Ω, ̔ (314), ̀ (300)
+ { 0x1f6c, 0x3a9 }, // Ὤ -> Ω, ̓ (313), ́ (301)
+ { 0x1f6d, 0x3a9 }, // Ὥ -> Ω, ̔ (314), ́ (301)
+ { 0x1f6e, 0x3a9 }, // Ὦ -> Ω, ̓ (313), ͂ (342)
+ { 0x1f6f, 0x3a9 }, // Ὧ -> Ω, ̔ (314), ͂ (342)
+ { 0x1f70, 0x3b1 }, // ὰ -> α, ̀ (300)
+ { 0x1f71, 0x3b1 }, // ά -> α, ́ (301)
+ { 0x1f72, 0x3b5 }, // ὲ -> ε, ̀ (300)
+ { 0x1f73, 0x3b5 }, // έ -> ε, ́ (301)
+ { 0x1f74, 0x3b7 }, // ὴ -> η, ̀ (300)
+ { 0x1f75, 0x3b7 }, // ή -> η, ́ (301)
+ { 0x1f76, 0x3b9 }, // ὶ -> ι, ̀ (300)
+ { 0x1f77, 0x3b9 }, // ί -> ι, ́ (301)
+ { 0x1f78, 0x3bf }, // ὸ -> ο, ̀ (300)
+ { 0x1f79, 0x3bf }, // ό -> ο, ́ (301)
+ { 0x1f7a, 0x3c5 }, // ὺ -> υ, ̀ (300)
+ { 0x1f7b, 0x3c5 }, // ύ -> υ, ́ (301)
+ { 0x1f7c, 0x3c9 }, // ὼ -> ω, ̀ (300)
+ { 0x1f7d, 0x3c9 }, // ώ -> ω, ́ (301)
+ { 0x1f80, 0x3b1 }, // ᾀ -> α, ̓ (313), ͅ (345)
+ { 0x1f81, 0x3b1 }, // ᾁ -> α, ̔ (314), ͅ (345)
+ { 0x1f82, 0x3b1 }, // ᾂ -> α, ̓ (313), ̀ (300), ͅ (345)
+ { 0x1f83, 0x3b1 }, // ᾃ -> α, ̔ (314), ̀ (300), ͅ (345)
+ { 0x1f84, 0x3b1 }, // ᾄ -> α, ̓ (313), ́ (301), ͅ (345)
+ { 0x1f85, 0x3b1 }, // ᾅ -> α, ̔ (314), ́ (301), ͅ (345)
+ { 0x1f86, 0x3b1 }, // ᾆ -> α, ̓ (313), ͂ (342), ͅ (345)
+ { 0x1f87, 0x3b1 }, // ᾇ -> α, ̔ (314), ͂ (342), ͅ (345)
+ { 0x1f88, 0x391 }, // ᾈ -> Α, ̓ (313), ͅ (345)
+ { 0x1f89, 0x391 }, // ᾉ -> Α, ̔ (314), ͅ (345)
+ { 0x1f8a, 0x391 }, // ᾊ -> Α, ̓ (313), ̀ (300), ͅ (345)
+ { 0x1f8b, 0x391 }, // ᾋ -> Α, ̔ (314), ̀ (300), ͅ (345)
+ { 0x1f8c, 0x391 }, // ᾌ -> Α, ̓ (313), ́ (301), ͅ (345)
+ { 0x1f8d, 0x391 }, // ᾍ -> Α, ̔ (314), ́ (301), ͅ (345)
+ { 0x1f8e, 0x391 }, // ᾎ -> Α, ̓ (313), ͂ (342), ͅ (345)
+ { 0x1f8f, 0x391 }, // ᾏ -> Α, ̔ (314), ͂ (342), ͅ (345)
+ { 0x1f90, 0x3b7 }, // ᾐ -> η, ̓ (313), ͅ (345)
+ { 0x1f91, 0x3b7 }, // ᾑ -> η, ̔ (314), ͅ (345)
+ { 0x1f92, 0x3b7 }, // ᾒ -> η, ̓ (313), ̀ (300), ͅ (345)
+ { 0x1f93, 0x3b7 }, // ᾓ -> η, ̔ (314), ̀ (300), ͅ (345)
+ { 0x1f94, 0x3b7 }, // ᾔ -> η, ̓ (313), ́ (301), ͅ (345)
+ { 0x1f95, 0x3b7 }, // ᾕ -> η, ̔ (314), ́ (301), ͅ (345)
+ { 0x1f96, 0x3b7 }, // ᾖ -> η, ̓ (313), ͂ (342), ͅ (345)
+ { 0x1f97, 0x3b7 }, // ᾗ -> η, ̔ (314), ͂ (342), ͅ (345)
+ { 0x1f98, 0x397 }, // ᾘ -> Η, ̓ (313), ͅ (345)
+ { 0x1f99, 0x397 }, // ᾙ -> Η, ̔ (314), ͅ (345)
+ { 0x1f9a, 0x397 }, // ᾚ -> Η, ̓ (313), ̀ (300), ͅ (345)
+ { 0x1f9b, 0x397 }, // ᾛ -> Η, ̔ (314), ̀ (300), ͅ (345)
+ { 0x1f9c, 0x397 }, // ᾜ -> Η, ̓ (313), ́ (301), ͅ (345)
+ { 0x1f9d, 0x397 }, // ᾝ -> Η, ̔ (314), ́ (301), ͅ (345)
+ { 0x1f9e, 0x397 }, // ᾞ -> Η, ̓ (313), ͂ (342), ͅ (345)
+ { 0x1f9f, 0x397 }, // ᾟ -> Η, ̔ (314), ͂ (342), ͅ (345)
+ { 0x1fa0, 0x3c9 }, // ᾠ -> ω, ̓ (313), ͅ (345)
+ { 0x1fa1, 0x3c9 }, // ᾡ -> ω, ̔ (314), ͅ (345)
+ { 0x1fa2, 0x3c9 }, // ᾢ -> ω, ̓ (313), ̀ (300), ͅ (345)
+ { 0x1fa3, 0x3c9 }, // ᾣ -> ω, ̔ (314), ̀ (300), ͅ (345)
+ { 0x1fa4, 0x3c9 }, // ᾤ -> ω, ̓ (313), ́ (301), ͅ (345)
+ { 0x1fa5, 0x3c9 }, // ᾥ -> ω, ̔ (314), ́ (301), ͅ (345)
+ { 0x1fa6, 0x3c9 }, // ᾦ -> ω, ̓ (313), ͂ (342), ͅ (345)
+ { 0x1fa7, 0x3c9 }, // ᾧ -> ω, ̔ (314), ͂ (342), ͅ (345)
+ { 0x1fa8, 0x3a9 }, // ᾨ -> Ω, ̓ (313), ͅ (345)
+ { 0x1fa9, 0x3a9 }, // ᾩ -> Ω, ̔ (314), ͅ (345)
+ { 0x1faa, 0x3a9 }, // ᾪ -> Ω, ̓ (313), ̀ (300), ͅ (345)
+ { 0x1fab, 0x3a9 }, // ᾫ -> Ω, ̔ (314), ̀ (300), ͅ (345)
+ { 0x1fac, 0x3a9 }, // ᾬ -> Ω, ̓ (313), ́ (301), ͅ (345)
+ { 0x1fad, 0x3a9 }, // ᾭ -> Ω, ̔ (314), ́ (301), ͅ (345)
+ { 0x1fae, 0x3a9 }, // ᾮ -> Ω, ̓ (313), ͂ (342), ͅ (345)
+ { 0x1faf, 0x3a9 }, // ᾯ -> Ω, ̔ (314), ͂ (342), ͅ (345)
+ { 0x1fb0, 0x3b1 }, // ᾰ -> α, ̆ (306)
+ { 0x1fb1, 0x3b1 }, // ᾱ -> α, ̄ (304)
+ { 0x1fb2, 0x3b1 }, // ᾲ -> α, ̀ (300), ͅ (345)
+ { 0x1fb3, 0x3b1 }, // ᾳ -> α, ͅ (345)
+ { 0x1fb4, 0x3b1 }, // ᾴ -> α, ́ (301), ͅ (345)
+ { 0x1fb6, 0x3b1 }, // ᾶ -> α, ͂ (342)
+ { 0x1fb7, 0x3b1 }, // ᾷ -> α, ͂ (342), ͅ (345)
+ { 0x1fb8, 0x391 }, // Ᾰ -> Α, ̆ (306)
+ { 0x1fb9, 0x391 }, // Ᾱ -> Α, ̄ (304)
+ { 0x1fba, 0x391 }, // Ὰ -> Α, ̀ (300)
+ { 0x1fbb, 0x391 }, // Ά -> Α, ́ (301)
+ { 0x1fbc, 0x391 }, // ᾼ -> Α, ͅ (345)
+ { 0x1fc2, 0x3b7 }, // ῂ -> η, ̀ (300), ͅ (345)
+ { 0x1fc3, 0x3b7 }, // ῃ -> η, ͅ (345)
+ { 0x1fc4, 0x3b7 }, // ῄ -> η, ́ (301), ͅ (345)
+ { 0x1fc6, 0x3b7 }, // ῆ -> η, ͂ (342)
+ { 0x1fc7, 0x3b7 }, // ῇ -> η, ͂ (342), ͅ (345)
+ { 0x1fc8, 0x395 }, // Ὲ -> Ε, ̀ (300)
+ { 0x1fc9, 0x395 }, // Έ -> Ε, ́ (301)
+ { 0x1fca, 0x397 }, // Ὴ -> Η, ̀ (300)
+ { 0x1fcb, 0x397 }, // Ή -> Η, ́ (301)
+ { 0x1fcc, 0x397 }, // ῌ -> Η, ͅ (345)
+ { 0x1fd0, 0x3b9 }, // ῐ -> ι, ̆ (306)
+ { 0x1fd1, 0x3b9 }, // ῑ -> ι, ̄ (304)
+ { 0x1fd2, 0x3b9 }, // ῒ -> ι, ̈ (308), ̀ (300)
+ { 0x1fd3, 0x3b9 }, // ΐ -> ι, ̈ (308), ́ (301)
+ { 0x1fd6, 0x3b9 }, // ῖ -> ι, ͂ (342)
+ { 0x1fd7, 0x3b9 }, // ῗ -> ι, ̈ (308), ͂ (342)
+ { 0x1fd8, 0x399 }, // Ῐ -> Ι, ̆ (306)
+ { 0x1fd9, 0x399 }, // Ῑ -> Ι, ̄ (304)
+ { 0x1fda, 0x399 }, // Ὶ -> Ι, ̀ (300)
+ { 0x1fdb, 0x399 }, // Ί -> Ι, ́ (301)
+ { 0x1fe0, 0x3c5 }, // ῠ -> υ, ̆ (306)
+ { 0x1fe1, 0x3c5 }, // ῡ -> υ, ̄ (304)
+ { 0x1fe2, 0x3c5 }, // ῢ -> υ, ̈ (308), ̀ (300)
+ { 0x1fe3, 0x3c5 }, // ΰ -> υ, ̈ (308), ́ (301)
+ { 0x1fe4, 0x3c1 }, // ῤ -> ρ, ̓ (313)
+ { 0x1fe5, 0x3c1 }, // ῥ -> ρ, ̔ (314)
+ { 0x1fe6, 0x3c5 }, // ῦ -> υ, ͂ (342)
+ { 0x1fe7, 0x3c5 }, // ῧ -> υ, ̈ (308), ͂ (342)
+ { 0x1fe8, 0x3a5 }, // Ῠ -> Υ, ̆ (306)
+ { 0x1fe9, 0x3a5 }, // Ῡ -> Υ, ̄ (304)
+ { 0x1fea, 0x3a5 }, // Ὺ -> Υ, ̀ (300)
+ { 0x1feb, 0x3a5 }, // Ύ -> Υ, ́ (301)
+ { 0x1fec, 0x3a1 }, // Ῥ -> Ρ, ̔ (314)
+ { 0x1ff2, 0x3c9 }, // ῲ -> ω, ̀ (300), ͅ (345)
+ { 0x1ff3, 0x3c9 }, // ῳ -> ω, ͅ (345)
+ { 0x1ff4, 0x3c9 }, // ῴ -> ω, ́ (301), ͅ (345)
+ { 0x1ff6, 0x3c9 }, // ῶ -> ω, ͂ (342)
+ { 0x1ff7, 0x3c9 }, // ῷ -> ω, ͂ (342), ͅ (345)
+ { 0x1ff8, 0x39f }, // Ὸ -> Ο, ̀ (300)
+ { 0x1ff9, 0x39f }, // Ό -> Ο, ́ (301)
+ { 0x1ffa, 0x3a9 }, // Ὼ -> Ω, ̀ (300)
+ { 0x1ffb, 0x3a9 }, // Ώ -> Ω, ́ (301)
+ { 0x1ffc, 0x3a9 }, // ῼ -> Ω, ͅ (345)
+ { 0x212b, 0x41 }, // Å -> A, ̊ (30a)
+ { 0x219a, 0x2190 }, // ↚ -> ←, ̸ (338)
+ { 0x219b, 0x2192 }, // ↛ -> →, ̸ (338)
+ { 0x21ae, 0x2194 }, // ↮ -> ↔, ̸ (338)
+ { 0x21cd, 0x21d0 }, // ⇍ -> ⇐, ̸ (338)
+ { 0x21ce, 0x21d4 }, // ⇎ -> ⇔, ̸ (338)
+ { 0x21cf, 0x21d2 }, // ⇏ -> ⇒, ̸ (338)
+ { 0x2204, 0x2203 }, // ∄ -> ∃, ̸ (338)
+ { 0x2209, 0x2208 }, // ∉ -> ∈, ̸ (338)
+ { 0x220c, 0x220b }, // ∌ -> ∋, ̸ (338)
+ { 0x2224, 0x2223 }, // ∤ -> ∣, ̸ (338)
+ { 0x2226, 0x2225 }, // ∦ -> ∥, ̸ (338)
+ { 0x2241, 0x223c }, // ≁ -> ∼, ̸ (338)
+ { 0x2244, 0x2243 }, // ≄ -> ≃, ̸ (338)
+ { 0x2247, 0x2245 }, // ≇ -> ≅, ̸ (338)
+ { 0x2249, 0x2248 }, // ≉ -> ≈, ̸ (338)
+ { 0x2260, 0x3d }, // ≠ -> =, ̸ (338)
+ { 0x2262, 0x2261 }, // ≢ -> ≡, ̸ (338)
+ { 0x226d, 0x224d }, // ≭ -> ≍, ̸ (338)
+ { 0x226e, 0x3c }, // ≮ -> <, ̸ (338)
+ { 0x226f, 0x3e }, // ≯ -> >, ̸ (338)
+ { 0x2270, 0x2264 }, // ≰ -> ≤, ̸ (338)
+ { 0x2271, 0x2265 }, // ≱ -> ≥, ̸ (338)
+ { 0x2274, 0x2272 }, // ≴ -> ≲, ̸ (338)
+ { 0x2275, 0x2273 }, // ≵ -> ≳, ̸ (338)
+ { 0x2278, 0x2276 }, // ≸ -> ≶, ̸ (338)
+ { 0x2279, 0x2277 }, // ≹ -> ≷, ̸ (338)
+ { 0x2280, 0x227a }, // ⊀ -> ≺, ̸ (338)
+ { 0x2281, 0x227b }, // ⊁ -> ≻, ̸ (338)
+ { 0x2284, 0x2282 }, // ⊄ -> ⊂, ̸ (338)
+ { 0x2285, 0x2283 }, // ⊅ -> ⊃, ̸ (338)
+ { 0x2288, 0x2286 }, // ⊈ -> ⊆, ̸ (338)
+ { 0x2289, 0x2287 }, // ⊉ -> ⊇, ̸ (338)
+ { 0x22ac, 0x22a2 }, // ⊬ -> ⊢, ̸ (338)
+ { 0x22ad, 0x22a8 }, // ⊭ -> ⊨, ̸ (338)
+ { 0x22ae, 0x22a9 }, // ⊮ -> ⊩, ̸ (338)
+ { 0x22af, 0x22ab }, // ⊯ -> ⊫, ̸ (338)
+ { 0x22e0, 0x227c }, // ⋠ -> ≼, ̸ (338)
+ { 0x22e1, 0x227d }, // ⋡ -> ≽, ̸ (338)
+ { 0x22e2, 0x2291 }, // ⋢ -> ⊑, ̸ (338)
+ { 0x22e3, 0x2292 }, // ⋣ -> ⊒, ̸ (338)
+ { 0x22ea, 0x22b2 }, // ⋪ -> ⊲, ̸ (338)
+ { 0x22eb, 0x22b3 }, // ⋫ -> ⊳, ̸ (338)
+ { 0x22ec, 0x22b4 }, // ⋬ -> ⊴, ̸ (338)
+ { 0x22ed, 0x22b5 }, // ⋭ -> ⊵, ̸ (338)
+ { 0x2adc, 0x2add }, // ⫝̸ -> ⫝, ̸ (338)
+};
--
1.7.2.3
Andrew Fuller
2010-12-23 15:24:43 UTC
Permalink
Post by Johannes Weißl
e.g. searching for Bjork finds Björk, or searching for
Trentemöller finds Trentemøller
Christmas came early this year! Cheers for this!

Andrew
Gregory Petrosyan
2010-12-24 00:12:56 UTC
Permalink
Post by Johannes Weißl
e.g. searching for Bjork finds Björk, or searching for
Trentemöller finds Trentemøller
Thanks a lot Johannes, I've merged this patchset to my local -pu and was going
to push it to gitorious, but I've encountered a crash:

u_collate.c:65:
free() is called on non-malloced buffer

This is because u_casefold() can return a gbuf_empty_buffer pointer for an empty
input string.

Gregory
Johannes Weißl
2010-12-24 10:43:02 UTC
Permalink
* Replace u_strcasecmp with u_strcasecoll and u_strcase_equal, since
Unicode strings can't be sorted locale-independently.
* Only use towlower() on systems where wchar_t is UCS-4.
---
Makefile | 2 +-
comment.c | 2 +-
glob.c | 2 +-
track_info.c | 17 ++---------
tree.c | 13 +++-----
u_collate.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
u_collate.h | 51 ++++++++++++++++++++++++++++++++++
uchar.c | 85 +++++++++++++++++++++++++++++++++++++--------------------
uchar.h | 35 ++++++++++++++++++++++-
ui_curses.c | 1 +
10 files changed, 230 insertions(+), 57 deletions(-)
create mode 100644 u_collate.c
create mode 100644 u_collate.h

diff --git a/Makefile b/Makefile
index 53a3cc8..aa3f032 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,7 @@ cmus-y := \
output.o pcm.o pl.o play_queue.o player.o \
rbtree.o read_wrapper.o server.o search.o \
search_mode.o spawn.o tabexp.o tabexp_file.o \
- track.o track_info.o tree.o uchar.o ui_curses.o \
+ track.o track_info.o tree.o u_collate.o uchar.o ui_curses.o \
window.o worker.o xstrjoin.o

$(cmus-y): CFLAGS += $(PTHREAD_CFLAGS) $(NCURSES_CFLAGS) $(ICONV_CFLAGS) $(DL_CFLAGS)
diff --git a/comment.c b/comment.c
index 042e930..a0e1674 100644
--- a/comment.c
+++ b/comment.c
@@ -29,7 +29,7 @@ int track_is_compilation(const struct keyval *comments)
return 1;

a = keyvals_get_val(comments, "artist");
- if (aa && a && u_strcasecmp(aa, a) != 0)
+ if (aa && a && !u_strcase_equal(aa, a))
return 1;

return 0;
diff --git a/glob.c b/glob.c
index ebca873..b4b0627 100644
--- a/glob.c
+++ b/glob.c
@@ -179,7 +179,7 @@ static int do_glob_match(struct list_head *head, struct list_head *first, const
if (gitem->type == GLOB_TEXT) {
int len = u_strlen(gitem->text);

- if (u_strncasecmp(gitem->text, text, len))
+ if (!u_strncase_equal(gitem->text, text, len))
return 0;
text += strlen(gitem->text);
} else if (gitem->type == GLOB_QMARK) {
diff --git a/track_info.c b/track_info.c
index 9190106..6a519c2 100644
--- a/track_info.c
+++ b/track_info.c
@@ -20,6 +20,7 @@
#include "track_info.h"
#include "comment.h"
#include "uchar.h"
+#include "u_collate.h"
#include "misc.h"
#include "xmalloc.h"
#include "utils.h"
@@ -113,18 +114,6 @@ int track_info_matches(struct track_info *ti, const char *text, unsigned int fla
return matched;
}

-static int xstrcasecmp(const char *a, const char *b)
-{
- if (a == NULL) {
- if (b == NULL)
- return 0;
- return -1;
- } else if (b == NULL) {
- return 1;
- }
- return u_strcasecmp(a, b);
-}
-
int track_info_cmp(const struct track_info *a, const struct track_info *b, const char * const *keys)
{
int i, res = 0;
@@ -158,7 +147,7 @@ int track_info_cmp(const struct track_info *a, const struct track_info *b, const
if (strcmp(key, "albumartist") == 0) {
av = comments_get_albumartist(a->comments);
bv = comments_get_albumartist(b->comments);
- res = xstrcasecmp(av, bv);
+ res = u_strcasecoll0(av, bv);
if (res)
break;
continue;
@@ -172,7 +161,7 @@ int track_info_cmp(const struct track_info *a, const struct track_info *b, const

av = keyvals_get_val(a->comments, key);
bv = keyvals_get_val(b->comments, key);
- res = xstrcasecmp(av, bv);
+ res = u_strcasecoll0(av, bv);
if (res)
break;
}
diff --git a/tree.c b/tree.c
index 3cf5ac6..aa53784 100644
--- a/tree.c
+++ b/tree.c
@@ -10,6 +10,7 @@
#include "debug.h"
#include "mergesort.h"
#include "options.h"
+#include "u_collate.h"

#include <ctype.h>
#include <stdio.h>
@@ -400,14 +401,10 @@ static void find_artist_and_album(const char *artist_name,
struct album *album;

list_for_each_entry(artist, &lib_artist_head, node) {
- int res;
-
- res = u_strcasecmp(artist->name, artist_name);
- if (res == 0) {
+ if (u_strcase_equal(artist->name, artist_name)) {
*_artist = artist;
list_for_each_entry(album, &artist->album_head, node) {
- res = u_strcasecmp(album->name, album_name);
- if (res == 0) {
+ if (u_strcase_equal(album->name, album_name)) {
*_album = album;
return;
}
@@ -428,7 +425,7 @@ static int special_name_cmp(const char *a, const char *b)

if (cmp)
return cmp;
- return u_strcasecmp(a, b);
+ return u_strcasecoll(a, b);
}

static int special_album_cmp(const struct album *a, const struct album *b)
@@ -442,7 +439,7 @@ static int special_album_cmp(const struct album *a, const struct album *b)
if (a->date != b->date)
return a->date - b->date;

- return u_strcasecmp(a->name, b->name);
+ return u_strcasecoll(a->name, b->name);
}

static void insert_artist(struct artist *artist)
diff --git a/u_collate.c b/u_collate.c
new file mode 100644
index 0000000..8dba7af
--- /dev/null
+++ b/u_collate.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2010 Johannes Weißl (based on gunicollate.c from glib)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include "u_collate.h"
+#include "uchar.h"
+#include "xmalloc.h"
+#include "ui_curses.h" /* using_utf8, charset */
+#include "convert.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+int u_strcoll(const char *str1, const char *str2)
+{
+ int result;
+
+ if (using_utf8) {
+ result = strcoll(str1, str2);
+ } else {
+ char *str1_locale, *str2_locale;
+
+ convert(str1, -1, &str1_locale, -1, charset, "UTF-8");
+ convert(str2, -1, &str2_locale, -1, charset, "UTF-8");
+
+ if (str1_locale && str2_locale)
+ result = strcoll(str1_locale, str2_locale);
+ else
+ result = strcmp(str1, str2);
+
+ if (str2_locale)
+ free(str2_locale);
+ if (str1_locale)
+ free(str1_locale);
+ }
+
+ return result;
+}
+
+int u_strcasecoll(const char *str1, const char *str2)
+{
+ char *cf_a, *cf_b;
+ int res;
+
+ cf_a = u_casefold(str1);
+ cf_b = u_casefold(str2);
+
+ res = u_strcoll(cf_a, cf_b);
+
+ free(cf_b);
+ free(cf_a);
+
+ return res;
+}
+
+int u_strcasecoll0(const char *str1, const char *str2)
+{
+ if (!str1)
+ return str2 ? -1 : 0;
+ if (!str2)
+ return 1;
+
+ return u_strcasecoll(str1, str2);
+}
diff --git a/u_collate.h b/u_collate.h
new file mode 100644
index 0000000..576df57
--- /dev/null
+++ b/u_collate.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2010 Johannes Weißl
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#ifndef U_COLLATE_H
+#define U_COLLATE_H
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string
+ * @str2 valid, normalized, null-terminated UTF-8 string
+ *
+ * Compares two strings for ordering using the linguistically
+ * correct rules for the current locale.
+ *
+ * Returns a negative value if @str1 compares before @str2, 0 if they
+ * compare equal, a positive value if @str1 compares after @str2.
+ */
+int u_strcoll(const char *str1, const char *str2);
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string
+ * @str2 valid, normalized, null-terminated UTF-8 string
+ *
+ * Like u_strcoll(), but do casefolding before comparing.
+ */
+int u_strcasecoll(const char *str1, const char *str2);
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string or NULL
+ * @str2 valid, normalized, null-terminated UTF-8 string or NULL
+ *
+ * Like u_strcasecoll(), but handle NULL pointers gracefully.
+ */
+int u_strcasecoll0(const char *str1, const char *str2);
+
+#endif
diff --git a/uchar.c b/uchar.c
index ca79d02..0ee1abc 100644
--- a/uchar.c
+++ b/uchar.c
@@ -19,6 +19,7 @@

#include "uchar.h"
#include "compiler.h"
+#include "gbuf.h"

#include <stdlib.h>
#include <string.h>
@@ -455,57 +456,81 @@ int u_skip_chars(const char *str, int *width)
}

/*
- * Comparison functions
+ * Case-folding functions
*/

-static inline int chcasecmp(int a, int b)
+static inline uchar u_casefold_char(uchar ch)
{
- return towupper(a) - towupper(b);
+ /* faster lookup for A-Z, rest of ASCII unaffected */
+ if (ch < 0x0041)
+ return ch;
+ if (ch <= 0x005A)
+ return ch + 0x20;
+#ifdef __STDC_ISO_10646__
+ if (ch < 128)
+ return ch;
+ ch = towlower(ch);
+#endif
+ return ch;
}

-int u_strcasecmp(const char *a, const char *b)
+char *u_casefold(const char *str)
{
- int ai = 0;
- int bi = 0;
- int res;
+ GBUF(out);
+ int i = 0;

- do {
+ while (str[i]) {
+ char buf[4];
+ int buflen = 0;
+ uchar ch;
+
+ u_get_char(str, &i, &ch);
+ ch = u_casefold_char(ch);
+ u_set_char_raw(buf, &buflen, ch);
+ gbuf_add_bytes(&out, buf, buflen);
+ }
+
+ return gbuf_steal(&out);
+}
+
+/*
+ * Comparison functions
+ */
+
+int u_strcase_equal(const char *a, const char *b)
+{
+ int ai = 0, bi = 0;
+
+ while (a[ai]) {
uchar au, bu;

u_get_char(a, &ai, &au);
u_get_char(b, &bi, &bu);
- res = chcasecmp(au, bu);
- if (res)
- break;
- if (au == 0) {
- /* bu is 0 too */
- break;
- }
- } while (1);
- return res;
+
+ if (u_casefold_char(au) != u_casefold_char(bu))
+ return 0;
+ }
+
+ return b[bi] ? 0 : 1;
}

-int u_strncasecmp(const char *a, const char *b, int len)
+int u_strncase_equal(const char *a, const char *b, size_t len)
{
- int ai = 0;
- int bi = 0;
+ int ai = 0, bi = 0;

- while (len > 0) {
+ while (b[bi] && len > 0) {
uchar au, bu;
- int res;

u_get_char(a, &ai, &au);
u_get_char(b, &bi, &bu);
- res = chcasecmp(au, bu);
- if (res)
- return res;
- if (au == 0) {
- /* bu is 0 too */
+
+ if (u_casefold_char(au) != u_casefold_char(bu))
return 0;
- }
+
len--;
}
- return 0;
+
+ return 1;
}

char *u_strcasestr(const char *haystack, const char *needle)
@@ -520,7 +545,7 @@ char *u_strcasestr(const char *haystack, const char *needle)

if (haystack_len < needle_len)
return NULL;
- if (u_strncasecmp(needle, haystack, needle_len) == 0)
+ if (u_strncase_equal(needle, haystack, needle_len))
return (char *)haystack;

/* skip one char */
diff --git a/uchar.h b/uchar.h
index f6fb56d..21cd317 100644
--- a/uchar.h
+++ b/uchar.h
@@ -149,8 +149,39 @@ int u_copy_chars(char *dst, const char *src, int *width);
*/
int u_skip_chars(const char *str, int *width);

-int u_strcasecmp(const char *a, const char *b);
-int u_strncasecmp(const char *a, const char *b, int len);
+/*
+ * @str valid null-terminated UTF-8 string
+ *
+ * Converts a string into a form that is independent of case.
+ *
+ * Returns a newly allocated string
+ */
+char *u_casefold(const char *str);
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string
+ * @str2 valid, normalized, null-terminated UTF-8 string
+ *
+ * Returns 1 if @str1 is equal to @str2, ignoring the case of the characters.
+ */
+int u_strcase_equal(const char *str1, const char *str2);
+
+/*
+ * @str1 valid, normalized, null-terminated UTF-8 string
+ * @str2 valid, normalized, null-terminated UTF-8 string
+ * @len number of characters to consider for comparison
+ *
+ * Returns 1 if the first @len characters of @str1 and @str2 are equal,
+ * ignoring the case of the characters (0 otherwise).
+ */
+int u_strncase_equal(const char *str1, const char *str2, size_t len);
+
+/*
+ * @haystack valid, normalized, null-terminated UTF-8 string
+ * @needle valid, normalized, null-terminated UTF-8 string
+ *
+ * Returns position of @needle in @haystack (case insensitive comparison).
+ */
char *u_strcasestr(const char *haystack, const char *needle);

static inline char *u_strcasestr_filename(const char *haystack, const char *needle)
diff --git a/ui_curses.c b/ui_curses.c
index 0472aaa..fc36766 100644
--- a/ui_curses.c
+++ b/ui_curses.c
@@ -2245,6 +2245,7 @@ int main(int argc, char *argv[])
}

setlocale(LC_CTYPE, "");
+ setlocale(LC_COLLATE, "");
#ifdef CODESET
charset = nl_langinfo(CODESET);
#else
--
1.7.2.3
Gregory Petrosyan
2010-12-24 15:42:20 UTC
Permalink
Post by Johannes Weißl
+static struct {
+       uchar composed;
+       uchar base;
+} unidecomp_map[] = {
+       {   0xc0,   0x41 },     // À -> A,       ̀ (300)
+       {   0xc1,   0x41 },     // Á -> A,       ́ (301)
+       {   0xc2,   0x41 },     // Â -> A,       ̂ (302)
+       {   0xc3,   0x41 },     // Ã -> A,       ̃ (303)
+       {   0xc4,   0x41 },     // Ä -> A,       ̈ (308)
+       {   0xc5,   0x41 },     // Å -> A,       ̊ (30a)
BTW, if we ever feel an extra need for speed, we can transform this
into an array without any holes, and replace binary search with a
plain ar

Gregory Petrosyan
2010-12-23 22:21:36 UTC
Permalink
Used to count members of static array.
- for (i = 0; i < sizeof(pl_mime_types) / sizeof(pl_mime_types[0]); i++) {
+ for (i = 0; i < N_ELEMENTS(pl_mime_types); i++) {
command_mode.c:2471 can be replaced as well :-)

Gregory
Johannes Weißl
2010-12-24 10:37:32 UTC
Permalink
Used to count members of static array.
---
command_mode.c | 2 +-
input.c | 2 +-
job.c | 2 +-
utils.h | 2 ++
4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/command_mode.c b/command_mode.c
index d1805f4..c236b6f 100644
--- a/command_mode.c
+++ b/command_mode.c
@@ -2468,7 +2468,7 @@ static void expand_commands(const char *str)
char **tails;

/* tabexp is resetted */
- tails = xnew(char *, sizeof(commands) / sizeof(struct command));
+ tails = xnew(char *, N_ELEMENTS(commands));
len = strlen(str);
pos = 0;
for (i = 0; commands[i].name; i++) {
diff --git a/input.c b/input.c
index 38b7911..12e8f23 100644
--- a/input.c
+++ b/input.c
@@ -353,7 +353,7 @@ static int open_remote(struct input_plugin *ip)
if (val) {
int i;

- for (i = 0; i < sizeof(pl_mime_types) / sizeof(pl_mime_types[0]); i++) {
+ for (i = 0; i < N_ELEMENTS(pl_mime_types); i++) {
if (!strcasecmp(val, pl_mime_types[i])) {
d_print("Content-Type: %s\n", val);
http_get_free(&hg);
diff --git a/job.c b/job.c
index 038c27f..a1b71f1 100644
--- a/job.c
+++ b/job.c
@@ -35,7 +35,7 @@ static void flush_ti_buffer(void)

static void add_ti(struct track_info *ti)
{
- if (ti_buffer_fill == sizeof(ti_buffer) / sizeof(ti_buffer[0]))
+ if (ti_buffer_fill == N_ELEMENTS(ti_buffer))
flush_ti_buffer();
ti_buffer[ti_buffer_fill++] = ti;
}
diff --git a/utils.h b/utils.h
index ad77d3f..aede09c 100644
--- a/utils.h
+++ b/utils.h
@@ -28,6 +28,8 @@
#include <time.h>
#include <inttypes.h>

+#define N_ELEMENTS(array) (sizeof(array) / sizeof((array)[0]))
+
static inline int min(int a, int b)
{
return a < b ? a : b;
--
1.7.2.3
Gregory Petrosyan
2010-12-23 22:43:06 UTC
Permalink
Post by Johannes Weißl
uchar.h | 28 ++++++++++++++--------------
Can you strip the rest of the unnecessary externs in cmus (at least in function
declarations) in this patch as well, please?

Gregory
Johannes Weißl
2010-12-24 10:41:42 UTC
Permalink
---
ape.h | 4 ++--
mergesort.h | 2 +-
rbtree.h | 16 ++++++++--------
read_wrapper.h | 2 +-
search_mode.h | 8 ++++----
track_info.h | 10 +++++-----
uchar.h | 28 ++++++++++++++--------------
window.h | 38 +++++++++++++++++++-------------------
xstrjoin.h | 2 +-
9 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/ape.h b/ape.h
index ecdb721..7a80ea5 100644
--- a/ape.h
+++ b/ape.h
@@ -37,8 +37,8 @@ struct apetag {

#define APETAG(name) struct apetag name = { .buf = NULL, .pos = 0, }

-extern int ape_read_tags(struct apetag *ape, int fd, int slow);
-extern char *ape_get_comment(struct apetag *ape, char **val);
+int ape_read_tags(struct apetag *ape, int fd, int slow);
+char *ape_get_comment(struct apetag *ape, char **val);

static inline void ape_free(struct apetag *ape)
{
diff --git a/mergesort.h b/mergesort.h
index f9a2c30..0b87be6 100644
--- a/mergesort.h
+++ b/mergesort.h
@@ -3,7 +3,7 @@

#include "list.h"

-extern void list_mergesort(struct list_head *head,
+void list_mergesort(struct list_head *head,
int (*compare)(const struct list_head *, const struct list_head *));

#endif
diff --git a/rbtree.h b/rbtree.h
index 11014b9..dfbb1ca 100644
--- a/rbtree.h
+++ b/rbtree.h
@@ -151,18 +151,18 @@ static inline void rb_set_color(struct rb_node *rb, int color)
#define RB_EMPTY_NODE(node) (rb_parent(node) == node)
#define RB_CLEAR_NODE(node) (rb_set_parent(node, node))

-extern void rb_insert_color(struct rb_node *, struct rb_root *);
-extern void rb_erase(struct rb_node *, struct rb_root *);
+void rb_insert_color(struct rb_node *, struct rb_root *);
+void rb_erase(struct rb_node *, struct rb_root *);

/* Find logical next and previous nodes in a tree */
-extern struct rb_node *rb_next(const struct rb_node *);
-extern struct rb_node *rb_prev(const struct rb_node *);
-extern struct rb_node *rb_first(const struct rb_root *);
-extern struct rb_node *rb_last(const struct rb_root *);
+struct rb_node *rb_next(const struct rb_node *);
+struct rb_node *rb_prev(const struct rb_node *);
+struct rb_node *rb_first(const struct rb_root *);
+struct rb_node *rb_last(const struct rb_root *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
- struct rb_root *root);
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root);

static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
struct rb_node ** rb_link)
diff --git a/read_wrapper.h b/read_wrapper.h
index 06764a5..c4c4313 100644
--- a/read_wrapper.h
+++ b/read_wrapper.h
@@ -24,6 +24,6 @@

#include <unistd.h>

-extern ssize_t read_wrapper(struct input_plugin_data *ip_data, void *buffer, size_t count);
+ssize_t read_wrapper(struct input_plugin_data *ip_data, void *buffer, size_t count);

#endif
diff --git a/search_mode.h b/search_mode.h
index 59548bb..d966c88 100644
--- a/search_mode.h
+++ b/search_mode.h
@@ -29,10 +29,10 @@ extern enum search_direction search_direction;
/* //WORDS or ??WORDS search mode */
extern int search_restricted;

-extern void search_mode_ch(uchar ch);
-extern void search_mode_key(int key);
-extern void search_mode_init(void);
-extern void search_mode_exit(void);
+void search_mode_ch(uchar ch);
+void search_mode_key(int key);
+void search_mode_init(void);
+void search_mode_exit(void);

void search_text(const char *text, int restricted, int beginning);

diff --git a/track_info.h b/track_info.h
index 6f4cf8d..8353940 100644
--- a/track_info.h
+++ b/track_info.h
@@ -40,16 +40,16 @@ struct track_info {
#define TI_MATCH_ALBUMARTIST (1 << 3)

/* initializes only filename and ref */
-extern struct track_info *track_info_new(const char *filename);
+struct track_info *track_info_new(const char *filename);

-extern void track_info_ref(struct track_info *ti);
-extern void track_info_unref(struct track_info *ti);
+void track_info_ref(struct track_info *ti);
+void track_info_unref(struct track_info *ti);

/*
* returns: 1 if @ti has any of the following tags: artist, album, title
* 0 otherwise
*/
-extern int track_info_has_tag(const struct track_info *ti);
+int track_info_has_tag(const struct track_info *ti);

/*
* @flags: TI_MATCH_*
@@ -57,7 +57,7 @@ extern int track_info_has_tag(const struct track_info *ti);
* returns: 1 if all words in @text are found to match defined fields (@flags) in @ti
* 0 otherwise
*/
-extern int track_info_matches(struct track_info *ti, const char *text, unsigned int flags);
+int track_info_matches(struct track_info *ti, const char *text, unsigned int flags);

int track_info_cmp(const struct track_info *a, const struct track_info *b, const char * const *keys);

diff --git a/uchar.h b/uchar.h
index 07870bd..e15f8bd 100644
--- a/uchar.h
+++ b/uchar.h
@@ -61,28 +61,28 @@ static inline int u_char_size(uchar uch)
/*
* Returns width of @uch (normally 1 or 2, 4 for invalid chars (<xx>))
*/
-extern int u_char_width(uchar uch);
+int u_char_width(uchar uch);

/*
* @str any null-terminated string
*
* Returns 1 if @str is valid UTF-8 string, 0 otherwise.
*/
-extern int u_is_valid(const char *str);
+int u_is_valid(const char *str);

/*
* @str null-terminated UTF-8 string
*
* Retuns length of @str in UTF-8 characters.
*/
-extern int u_strlen(const char *str);
+int u_strlen(const char *str);

/*
* @str null-terminated UTF-8 string
*
* Retuns width of @str.
*/
-extern int u_str_width(const char *str);
+int u_str_width(const char *str);

/*
* @str null-terminated UTF-8 string
@@ -90,24 +90,24 @@ extern int u_str_width(const char *str);
*
* Retuns width of the first @len characters in @str.
*/
-extern int u_str_nwidth(const char *str, int len);
+int u_str_nwidth(const char *str, int len);

-extern void u_prev_char_pos(const char *str, int *idx);
+void u_prev_char_pos(const char *str, int *idx);

/*
* @str null-terminated UTF-8 string
* @idx pointer to byte index in @str (not UTF-8 character index!)
* @uch pointer to returned unicode character
*/
-extern void u_get_char(const char *str, int *idx, uchar *uch);
+void u_get_char(const char *str, int *idx, uchar *uch);

/*
* @str destination buffer
* @idx pointer to byte index in @str (not UTF-8 character index!)
* @uch unicode character
*/
-extern void u_set_char_raw(char *str, int *idx, uchar uch);
-extern void u_set_char(char *str, int *idx, uchar uch);
+void u_set_char_raw(char *str, int *idx, uchar uch);
+void u_set_char(char *str, int *idx, uchar uch);

/*
* @dst destination buffer
@@ -120,7 +120,7 @@ extern void u_set_char(char *str, int *idx, uchar uch);
*
* Returns number of _bytes_ copied.
*/
-extern int u_copy_chars(char *dst, const char *src, int *width);
+int u_copy_chars(char *dst, const char *src, int *width);

/*
* @str null-terminated UTF-8 string, must be long enough
@@ -133,11 +133,11 @@ extern int u_copy_chars(char *dst, const char *src, int *width);
*
* Returns number of _bytes_ skipped.
*/
-extern int u_skip_chars(const char *str, int *width);
+int u_skip_chars(const char *str, int *width);

-extern int u_strcasecmp(const char *a, const char *b);
-extern int u_strncasecmp(const char *a, const char *b, int len);
-extern char *u_strcasestr(const char *haystack, const char *needle);
+int u_strcasecmp(const char *a, const char *b);
+int u_strncasecmp(const char *a, const char *b, int len);
+char *u_strcasestr(const char *haystack, const char *needle);

static inline char *u_strcasestr_filename(const char *haystack, const char *needle)
{
diff --git a/window.h b/window.h
index 72632af..a4670d7 100644
--- a/window.h
+++ b/window.h
@@ -58,10 +58,10 @@ struct window {
void (*sel_changed)(void);
};

-extern struct window *window_new(int (*get_prev)(struct iter *), int (*get_next)(struct iter *));
-extern void window_free(struct window *win);
-extern void window_set_empty(struct window *win);
-extern void window_set_contents(struct window *win, void *head);
+struct window *window_new(int (*get_prev)(struct iter *), int (*get_next)(struct iter *));
+void window_free(struct window *win);
+void window_set_empty(struct window *win);
+void window_set_contents(struct window *win, void *head);

/* call this after rows were added to window or order of rows was changed.
* top and sel MUST point to valid rows (or window must be empty, but then
@@ -70,27 +70,27 @@ extern void window_set_contents(struct window *win, void *head);
* if you remove row from window then call window_row_vanishes BEFORE removing
* the row instead of this function.
*/
-extern void window_changed(struct window *win);
+void window_changed(struct window *win);

/* call this BEFORE row is removed from window */
-extern void window_row_vanishes(struct window *win, struct iter *iter);
+void window_row_vanishes(struct window *win, struct iter *iter);

-extern int window_get_top(struct window *win, struct iter *iter);
-extern int window_get_sel(struct window *win, struct iter *iter);
-extern int window_get_prev(struct window *win, struct iter *iter);
-extern int window_get_next(struct window *win, struct iter *iter);
+int window_get_top(struct window *win, struct iter *iter);
+int window_get_sel(struct window *win, struct iter *iter);
+int window_get_prev(struct window *win, struct iter *iter);
+int window_get_next(struct window *win, struct iter *iter);

/* set selected row */
-extern void window_set_sel(struct window *win, struct iter *iter);
+void window_set_sel(struct window *win, struct iter *iter);

-extern void window_set_nr_rows(struct window *win, int nr_rows);
-extern void window_up(struct window *win, int rows);
-extern void window_down(struct window *win, int rows);
-extern void window_goto_top(struct window *win);
-extern void window_goto_bottom(struct window *win);
-extern void window_page_up(struct window *win);
-extern void window_page_down(struct window *win);
+void window_set_nr_rows(struct window *win, int nr_rows);
+void window_up(struct window *win, int rows);
+void window_down(struct window *win, int rows);
+void window_goto_top(struct window *win);
+void window_goto_bottom(struct window *win);
+void window_page_up(struct window *win);
+void window_page_down(struct window *win);

-extern int window_get_nr_rows(struct window *win);
+int window_get_nr_rows(struct window *win);

#endif
diff --git a/xstrjoin.h b/xstrjoin.h
index 44b3b33..c46a2d3 100644
--- a/xstrjoin.h
+++ b/xstrjoin.h
@@ -20,6 +20,6 @@
#ifndef _XSTRJOIN_H
#define _XSTRJOIN_H

-extern char *xstrjoin(const char *a, const char *b);
+char *xstrjoin(const char *a, const char *b);

#endif
--
1.7.2.3
Loading...