Discussion:
[PATCH 1/4] ui_curses.c: remove code duplication
Johannes Weißl
2011-01-26 06:28:25 UTC
Permalink
---
ui_curses.c | 26 +++++---------------------
1 files changed, 5 insertions(+), 21 deletions(-)

diff --git a/ui_curses.c b/ui_curses.c
index fc36766..f78fdf1 100644
--- a/ui_curses.c
+++ b/ui_curses.c
@@ -1850,7 +1850,7 @@ static void u_getch(void)
if (bit == 7) {
/* ascii */
u = ch;
- } else {
+ } else if (using_utf8) {
int count;

u = ch & ((1 << bit) - 1);
@@ -1864,7 +1864,8 @@ static void u_getch(void)
u = (u << 6) | (ch & 63);
count--;
}
- }
+ } else
+ u = ch | U_INVALID_MASK;
handle_ch(u);
}

@@ -1973,25 +1974,8 @@ static void main_loop(void)
item = next;
}

- if (FD_ISSET(0, &set)) {
- if (using_utf8) {
- u_getch();
- } else {
- int key = getch();
-
- if (key != ERR && key != 0) {
- if (key > 255) {
- handle_key(key);
- } else {
- uchar ch = key;
-
- if (ch > 0x7f)
- ch |= U_INVALID_MASK;
- handle_ch(ch);
- }
- }
- }
- }
+ if (FD_ISSET(0, &set))
+ u_getch();
}
}
--
1.7.2.3
Johannes Weißl
2011-01-26 06:28:26 UTC
Permalink
cmdline is not in UTF-8 but in locale encoding, so all unicode functions
dealing with it must handle any arbitrary single byte encoding
gracefully. If not, cmdline editing is buggy when locale charmap !=
UTF-8.
---
cmdline.c | 4 ++--
command_mode.c | 4 ++--
uchar.c | 33 +++++++++++++++++++++++++++++++++
uchar.h | 13 ++++++++++---
4 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/cmdline.c b/cmdline.c
index e942b44..3f6afd8 100644
--- a/cmdline.c
+++ b/cmdline.c
@@ -90,7 +90,7 @@ void cmdline_set_text(const char *text)
cmdline.line = xrenew(char, cmdline.line, cmdline.size);
}
memcpy(cmdline.line, text, len + 1);
- cmdline.cpos = u_strlen(cmdline.line);
+ cmdline.cpos = u_strlen_safe(cmdline.line);
cmdline.bpos = len;
cmdline.clen = cmdline.cpos;
cmdline.blen = len;
@@ -109,7 +109,7 @@ void cmdline_clear_end(void)
{
cmdline.line[cmdline.bpos] = 0;

- cmdline.clen = u_strlen(cmdline.line);
+ cmdline.clen = u_strlen_safe(cmdline.line);
cmdline.blen = strlen(cmdline.line);
}

diff --git a/command_mode.c b/command_mode.c
index c236b6f..7b842e4 100644
--- a/command_mode.c
+++ b/command_mode.c
@@ -2603,8 +2603,8 @@ static void tab_expand(void)
}
sprintf(cmdline.line, "%s%s", tmp, s2);
cmdline.bpos = l1;
- cmdline.cpos = u_strlen(tmp);
- cmdline.clen = u_strlen(cmdline.line);
+ cmdline.cpos = u_strlen_safe(tmp);
+ cmdline.clen = u_strlen_safe(cmdline.line);
free(tmp);
}
free(s1);
diff --git a/uchar.c b/uchar.c
index 419d17e..f9b409c 100644
--- a/uchar.c
+++ b/uchar.c
@@ -142,6 +142,39 @@ size_t u_strlen(const char *str)
return len;
}

+size_t u_strlen_safe(const char *str)
+{
+ const unsigned char *s = (const unsigned char *)str;
+ size_t len = 0;
+
+ while (*s) {
+ int l = len_tab[*s];
+
+ if (unlikely(l > 1)) {
+ /* next l - 1 bytes must be 0x10xxxxxx */
+ int c = 1;
+ do {
+ if (len_tab[s[c]] != 0) {
+ /* invalid sequence */
+ goto single_char;
+ }
+ c++;
+ } while (c < l);
+
+ /* valid sequence */
+ s += l;
+ len++;
+ continue;
+ }
+single_char:
+ /* l is -1, 0 or 1
+ * invalid chars counted as single characters */
+ s++;
+ len++;
+ }
+ return len;
+}
+
int u_char_width(uchar u)
{
if (unlikely(u < 0x20))
diff --git a/uchar.h b/uchar.h
index fed36a5..6c3f526 100644
--- a/uchar.h
+++ b/uchar.h
@@ -73,10 +73,9 @@ int u_char_width(uchar uch);
int u_is_valid(const char *str);

/*
- * @str null-terminated UTF-8 string
+ * @str valid, null-terminated UTF-8 string
*
* Returns position of next unicode character in @str.
- * (fast and fault-tolerant)
*/
extern const char * const utf8_skip;
static inline char *u_next_char(const char *str)
@@ -85,7 +84,7 @@ static inline char *u_next_char(const char *str)
}

/*
- * @str null-terminated UTF-8 string
+ * @str valid, null-terminated UTF-8 string
*
* Retuns length of @str in UTF-8 characters.
*/
@@ -94,6 +93,14 @@ size_t u_strlen(const char *str);
/*
* @str null-terminated UTF-8 string
*
+ * Retuns length of @str in UTF-8 characters.
+ * Invalid chars are counted as single characters.
+ */
+size_t u_strlen_safe(const char *str);
+
+/*
+ * @str null-terminated UTF-8 string
+ *
* Retuns width of @str.
*/
int u_str_width(const char *str);
--
1.7.2.3
Johannes Weißl
2011-01-26 06:28:27 UTC
Permalink
cmdline is in locale encoding, tags are always in UTF-8. Without this
patch searching and filtering for non-ASCII characters is not possible
when locale charmap != UTF-8.

Also fixes small memory leak in expr_parse().
---
expr.c | 28 +++++++++++++++++-----------
search.c | 18 +++++++++++++++++-
ui_curses.c | 10 +++++++++-
3 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/expr.c b/expr.c
index f54c68c..36a5d50 100644
--- a/expr.c
+++ b/expr.c
@@ -11,6 +11,8 @@
#include "utils.h"
#include "debug.h"
#include "list.h"
+#include "ui_curses.h" /* using_utf8, charset */
+#include "convert.h"

#include <stdio.h>
#include <stdlib.h>
@@ -700,37 +702,41 @@ struct expr *expr_parse(const char *str)
LIST_HEAD(head);
struct expr *root = NULL;
struct list_head *item;
- char *new_str = NULL;
+ char *long_str = NULL, *u_str = NULL;
int i;

for (i = 0; str[i]; i++) {
unsigned char c = str[i];
if (c < 0x20) {
set_error("filter contains control characters");
- return NULL;
+ goto out;
}
}
+ if (!using_utf8 && utf8_encode(str, charset, &u_str) == 0) {
+ str = u_str;
+ }
if (!u_is_valid(str)) {
set_error("invalid UTF-8");
- return NULL;
+ goto out;
}

if (is_short_filter(str)) {
- new_str = expand_short_expr(str);
- if (!new_str)
- return NULL;
+ str = long_str = expand_short_expr(str);
+ if (!str)
+ goto out;
}

- if (tokenize(&head, new_str ? new_str : str))
- return NULL;
-
- if (new_str)
- free(new_str);
+ if (tokenize(&head, str))
+ goto out;

item = head.next;
if (parse(&root, &head, &item, 0))
root = NULL;
free_tokens(&head);
+
+out:
+ free(u_str);
+ free(long_str);
return root;
}

diff --git a/search.c b/search.c
index f8925e7..53400a0 100644
--- a/search.c
+++ b/search.c
@@ -6,6 +6,7 @@
#include "editable.h"
#include "xmalloc.h"
#include "ui_curses.h"
+#include "convert.h"

struct searchable {
void *data;
@@ -24,7 +25,7 @@ static void search_unlock(void)
}

/* returns next matching track (can be current!) or NULL if not found */
-static int do_search(struct searchable *s, struct iter *iter, const char *text, int direction)
+static int do_u_search(struct searchable *s, struct iter *iter, const char *text, int direction)
{
struct iter start = *iter;
const char *msg = NULL;
@@ -56,6 +57,21 @@ static int do_search(struct searchable *s, struct iter *iter, const char *text,
}
}

+static int do_search(struct searchable *s, struct iter *iter, const char *text, int direction)
+{
+ char *u_text = NULL;
+ int r;
+
+ /* search text is always in locale encoding (because cmdline is) */
+ if (!using_utf8 && utf8_encode(text, charset, &u_text) == 0)
+ text = u_text;
+
+ r = do_u_search(s, iter, text, direction);
+
+ free(u_text);
+ return r;
+}
+
struct searchable *searchable_new(void *data, const struct iter *head, const struct searchable_ops *ops)
{
struct searchable *s;
diff --git a/ui_curses.c b/ui_curses.c
index f78fdf1..a726859 100644
--- a/ui_curses.c
+++ b/ui_curses.c
@@ -653,6 +653,7 @@ static void print_filter(struct window *win, int row, struct iter *iter)
int current = !!e->act_stat;
const char stat_chars[3] = " *!";
int ch1, ch2, ch3, pos;
+ const char *e_filter;

window_get_sel(win, &sel);
selected = iters_equal(iter, &sel);
@@ -670,7 +671,14 @@ static void print_filter(struct window *win, int row, struct iter *iter)
ch3 = ']';
}
ch2 = stat_chars[e->sel_stat];
- snprintf(buf, sizeof(buf), "%c%c%c%-15s %s", ch1, ch2, ch3, e->name, e->filter);
+
+ e_filter = e->filter;
+ if (!using_utf8) {
+ utf8_encode(e_filter);
+ e_filter = conv_buffer;
+ }
+
+ snprintf(buf, sizeof(buf), "%c%c%c%-15s %s", ch1, ch2, ch3, e->name, e_filter);
pos = format_str(print_buffer, buf, COLS - 1);
print_buffer[pos++] = ' ';
print_buffer[pos] = 0;
--
1.7.2.3
Johannes Weißl
2011-01-26 06:28:28 UTC
Permalink
Filename is in locale encoding, the search term in UTF-8. This patch
enables searching for non-ASCII characters in filename (used when no
track title is available) if locale charmap != UTF-8.
---
uchar.c | 12 ++++++++++++
uchar.h | 11 +++++++----
2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/uchar.c b/uchar.c
index f9b409c..ade6f86 100644
--- a/uchar.c
+++ b/uchar.c
@@ -21,6 +21,8 @@
#include "compiler.h"
#include "gbuf.h"
#include "utils.h" /* N_ELEMENTS */
+#include "ui_curses.h" /* using_utf8, charset */
+#include "convert.h"

#include <stdlib.h>
#include <string.h>
@@ -638,3 +640,13 @@ char *u_strcasestr_base(const char *haystack, const char *needle)
{
return do_u_strcasestr(haystack, needle, 1);
}
+
+char *u_strcasestr_filename(const char *haystack, const char *needle)
+{
+ char *r = NULL, *ustr = NULL;
+ if (!using_utf8 && utf8_encode(haystack, charset, &ustr) == 0)
+ haystack = ustr;
+ r = u_strcasestr_base(haystack, needle);
+ free(ustr);
+ return r;
+}
diff --git a/uchar.h b/uchar.h
index 6c3f526..e12b9d2 100644
--- a/uchar.h
+++ b/uchar.h
@@ -210,9 +210,12 @@ char *u_strcasestr(const char *haystack, const char *needle);
*/
char *u_strcasestr_base(const char *haystack, const char *needle);

-static inline char *u_strcasestr_filename(const char *haystack, const char *needle)
-{
- return u_strcasestr(haystack, needle);
-}
+/*
+ * @haystack null-terminated string in local encoding
+ * @needle valid, normalized, null-terminated UTF-8 string
+ *
+ * Like u_strcasestr_base(), but converts @haystack to UTF-8 if necessary.
+ */
+char *u_strcasestr_filename(const char *haystack, const char *needle);

#endif
--
1.7.2.3
Gregory Petrosyan
2011-01-26 20:36:42 UTC
Permalink
Post by Johannes Weißl
Filename is in locale encoding, the search term in UTF-8. This patch
enables searching for non-ASCII characters in filename (used when no
track title is available) if locale charmap != UTF-8.
Patches like this (2, 3, 4) look like they should go to -maint, while
1 is for master, am I right?

                Gregory
Johannes Weißl
2011-01-26 22:42:37 UTC
Permalink
Post by Gregory Petrosyan
Post by Johannes Weißl
Filename is in locale encoding, the search term in UTF-8. This patch
enables searching for non-ASCII characters in filename (used when no
track title is available) if locale charmap != UTF-8.
Patches like this (2, 3, 4) look like they should go to -maint, while
1 is for master, am I right?
1 is just cleanup, so master is ok, but 2 fixes a problem which is only
in master. 3+4 could go to maint, but they won't apply since I wrote
them for master. I can re-release them as maint versions, if you like!


Johannes
Gregory Petrosyan
2011-01-30 13:34:20 UTC
Permalink
Post by Johannes Weißl
Post by Gregory Petrosyan
Post by Johannes Weißl
Filename is in locale encoding, the search term in UTF-8. This patch
enables searching for non-ASCII characters in filename (used when no
track title is available) if locale charmap != UTF-8.
Patches like this (2, 3, 4) look like they should go to -maint, while
1 is for master, am I right?
1 is just cleanup, so master is ok, but 2 fixes a problem which is only
them for master. I can re-release them as maint versions, if you like!
Please do! (Sorry for not replying for so long).

Gregory
Johannes Weißl
2011-01-30 16:21:23 UTC
Permalink
Post by Gregory Petrosyan
Post by Johannes Weißl
1 is just cleanup, so master is ok, but 2 fixes a problem which is only
them for master. I can re-release them as maint versions, if you like!
Please do! (Sorry for not replying for so long).
Ok, but I need to backport commit 3255c9f (make using_utf8 and charset
global) to make it work. Also, patch 3 and 4 should be committed to
master too, since merging these patches from maint back to master won't
work...


Johannes
Johannes Weißl
2011-01-30 16:35:13 UTC
Permalink
---
ui_curses.c | 12 +++++-------
ui_curses.h | 3 +++
2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/ui_curses.c b/ui_curses.c
index 145e42d..5a5675a 100644
--- a/ui_curses.c
+++ b/ui_curses.c
@@ -81,6 +81,9 @@ int cur_view = TREE_VIEW;
struct searchable *searchable;
char *lib_filename = NULL;
char *pl_filename = NULL;
+char *charset = NULL;
+int using_utf8 = 0;
+

/* ------------------------------------------------------------------------- */

@@ -98,7 +101,6 @@ static int error_count = 0;

static char *server_address = NULL;

-static char *charset = NULL;
static char print_buffer[512];

/* destination buffer for utf8_encode and utf8_decode */
@@ -107,8 +109,6 @@ static char conv_buffer[512];
/* one character can take up to 4 bytes in UTF-8 */
#define print_buffer_max_width (sizeof(print_buffer) / 4 - 1)

-static int using_utf8;
-
static char tcap_buffer[64];
static const char *t_ts;
static const char *t_fs;
@@ -2222,11 +2222,9 @@ int main(int argc, char *argv[])
#else
charset = "ISO-8859-1";
#endif
- if (strcmp(charset, "UTF-8") == 0) {
+ if (strcmp(charset, "UTF-8") == 0)
using_utf8 = 1;
- } else {
- using_utf8 = 0;
- }
+
misc_init();
if (server_address == NULL)
server_address = xstrjoin(cmus_config_dir, "/socket");
diff --git a/ui_curses.h b/ui_curses.h
index 88762e5..75bb3a8 100644
--- a/ui_curses.h
+++ b/ui_curses.h
@@ -38,6 +38,9 @@ extern struct searchable *searchable;
extern char *lib_filename;
extern char *pl_filename;

+extern char *charset;
+extern int using_utf8;
+
void update_titleline(void);
void update_statusline(void);
void update_colors(void);
--
1.7.2.3
Johannes Weißl
2011-01-30 16:35:14 UTC
Permalink
cmdline is in locale encoding, tags are always in UTF-8. Without this
patch searching and filtering for non-ASCII characters is not possible
when locale charmap != UTF-8.

Also fixes small memory leak in expr_parse().
---
expr.c | 15 ++++++++++++---
search.c | 19 ++++++++++++++++++-
ui_curses.c | 10 +++++++++-
3 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/expr.c b/expr.c
index 473cde4..4dc3ef5 100644
--- a/expr.c
+++ b/expr.c
@@ -11,6 +11,8 @@
#include "utils.h"
#include "debug.h"
#include "list.h"
+#include "ui_curses.h" /* using_utf8, charset */
+#include "utf8_encode.h"

#include <stdio.h>
#include <stdlib.h>
@@ -387,27 +389,34 @@ struct expr *expr_parse(const char *str)
LIST_HEAD(head);
struct expr *root = NULL;
struct list_head *item;
+ char *u_str = NULL;
int i;

for (i = 0; str[i]; i++) {
unsigned char c = str[i];
if (c < 0x20) {
set_error("filter contains control characters");
- return NULL;
+ goto out;
}
}
+ if (!using_utf8 && utf8_encode(str, charset, &u_str) == 0) {
+ str = u_str;
+ }
if (!u_is_valid(str)) {
set_error("invalid UTF-8");
- return NULL;
+ goto out;
}

if (tokenize(&head, str))
- return NULL;
+ goto out;

item = head.next;
if (parse(&root, &head, &item, 0))
root = NULL;
free_tokens(&head);
+
+out:
+ free(u_str);
return root;
}

diff --git a/search.c b/search.c
index 9efb6cd..d9913e5 100644
--- a/search.c
+++ b/search.c
@@ -5,6 +5,8 @@
#include "search.h"
#include "editable.h"
#include "xmalloc.h"
+#include "ui_curses.h"
+#include "utf8_encode.h"

struct searchable {
void *data;
@@ -23,7 +25,7 @@ static void search_unlock(void)
}

/* returns next matching track (can be current!) or NULL if not found */
-static int do_search(struct searchable *s, struct iter *iter, const char *text, int direction)
+static int do_u_search(struct searchable *s, struct iter *iter, const char *text, int direction)
{
while (1) {
if (s->ops.matches(s->data, iter, text))
@@ -38,6 +40,21 @@ static int do_search(struct searchable *s, struct iter *iter, const char *text,
}
}

+static int do_search(struct searchable *s, struct iter *iter, const char *text, int direction)
+{
+ char *u_text = NULL;
+ int r;
+
+ /* search text is always in locale encoding (because cmdline is) */
+ if (!using_utf8 && utf8_encode(text, charset, &u_text) == 0)
+ text = u_text;
+
+ r = do_u_search(s, iter, text, direction);
+
+ free(u_text);
+ return r;
+}
+
struct searchable *searchable_new(void *data, const struct iter *head, const struct searchable_ops *ops)
{
struct searchable *s;
diff --git a/ui_curses.c b/ui_curses.c
index 5a5675a..aa0cc1e 100644
--- a/ui_curses.c
+++ b/ui_curses.c
@@ -645,6 +645,7 @@ static void print_filter(struct window *win, int row, struct iter *iter)
int current = !!e->act_stat;
const char stat_chars[3] = " *!";
int ch1, ch2, ch3, pos;
+ const char *e_filter;

window_get_sel(win, &sel);
selected = iters_equal(iter, &sel);
@@ -662,7 +663,14 @@ static void print_filter(struct window *win, int row, struct iter *iter)
ch3 = ']';
}
ch2 = stat_chars[e->sel_stat];
- snprintf(buf, sizeof(buf), "%c%c%c%-15s %s", ch1, ch2, ch3, e->name, e->filter);
+
+ e_filter = e->filter;
+ if (!using_utf8) {
+ utf8_encode(e_filter);
+ e_filter = conv_buffer;
+ }
+
+ snprintf(buf, sizeof(buf), "%c%c%c%-15s %s", ch1, ch2, ch3, e->name, e_filter);
pos = format_str(print_buffer, buf, COLS - 1);
print_buffer[pos++] = ' ';
print_buffer[pos] = 0;
--
1.7.2.3
Johannes Weißl
2011-01-30 16:35:15 UTC
Permalink
Filename is in locale encoding, the search term in UTF-8. This patch
enables searching for non-ASCII characters in filename (used when no
track title is available) if locale charmap != UTF-8.
---
uchar.c | 12 ++++++++++++
uchar.h | 11 +++++++----
2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/uchar.c b/uchar.c
index 6f342b0..ff66063 100644
--- a/uchar.c
+++ b/uchar.c
@@ -19,6 +19,8 @@

#include "uchar.h"
#include "compiler.h"
+#include "ui_curses.h" /* using_utf8, charset */
+#include "utf8_encode.h"

#include <stdlib.h>
#include <string.h>
@@ -542,3 +544,13 @@ char *u_strcasestr(const char *haystack, const char *needle)
haystack_len -= idx;
} while (1);
}
+
+char *u_strcasestr_filename(const char *haystack, const char *needle)
+{
+ char *r = NULL, *ustr = NULL;
+ if (!using_utf8 && utf8_encode(haystack, charset, &ustr) == 0)
+ haystack = ustr;
+ r = u_strcasestr(haystack, needle);
+ free(ustr);
+ return r;
+}
diff --git a/uchar.h b/uchar.h
index 07870bd..4b93812 100644
--- a/uchar.h
+++ b/uchar.h
@@ -139,9 +139,12 @@ extern int u_strcasecmp(const char *a, const char *b);
extern int u_strncasecmp(const char *a, const char *b, int len);
extern char *u_strcasestr(const char *haystack, const char *needle);

-static inline char *u_strcasestr_filename(const char *haystack, const char *needle)
-{
- return u_strcasestr(haystack, needle);
-}
+/*
+ * @haystack null-terminated string in local encoding
+ * @needle valid, normalized, null-terminated UTF-8 string
+ *
+ * Like u_strcasestr_base(), but converts @haystack to UTF-8 if necessary.
+ */
+char *u_strcasestr_filename(const char *haystack, const char *needle);

#endif
--
1.7.2.3
Gregory Petrosyan
2011-02-06 14:19:03 UTC
Permalink
Pushed all 4 patches to master, and 3 rebased ones to maint, thanks Johannes!

Gregory
Johannes Weißl
2011-02-08 00:10:11 UTC
Permalink
Post by Gregory Petrosyan
Pushed all 4 patches to master, and 3 rebased ones to maint, thanks Johannes!
Thanks! I've tested it again, and nearly everything works now! I found a
small corner-case that needs one further patch: In expr.c the filename
is not converted to UTF-8 when matching (u_strcasestr_filename() can't
be used since glob.c doesn't know anything about the type of the
string). I attached the patch!

Johannes
Gregory Petrosyan
2011-02-10 15:24:19 UTC
Permalink
Post by Johannes Weißl
Post by Gregory Petrosyan
Pushed all 4 patches to master, and 3 rebased ones to maint, thanks Johannes!
Thanks! I've tested it again, and nearly everything works now! I found a
small corner-case that needs one further patch: In expr.c the filename
is not converted to UTF-8 when matching (u_strcasestr_filename() can't
be used since glob.c doesn't know anything about the type of the
string). I attached the patch!
Thanks, merged!

Gregory

Loading...