summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Roman Artiukhin <bahusdrive@gmail.com> 2024-12-11 16:00:55 +0200
committer Roman Artiukhin <bahusdrive@gmail.com> 2024-12-13 17:38:05 +0200
commit a23002cd5e8dc555cd9d8f5c5df46839614bbe83 (patch)
tree 2879f618ac9a3a670639d868f47ea2cc5ffad6f4
parent 11fbbc78262b1ec8498cf11a307a9233274ddcfe (diff)
download rockbox-a23002cd5e.tar.gz
rockbox-a23002cd5e.zip
unicode: Unify check for UTF-16 BOM
Adds utf16_has_bom function.
Change-Id: I67ea474c9cf6ca6e6684351c2f54131164b7903c
-rw-r--r-- apps/cuesheet.c 15
-rw-r--r-- firmware/common/unicode.c 22
-rw-r--r-- firmware/include/rbunicode.h 2
-rw-r--r-- lib/rbcodec/metadata/id3tags.c 37
4 files changed, 43 insertions, 33 deletions
diff --git a/apps/cuesheet.c b/apps/cuesheet.c
index 2fe24d0a40..227ef5902c 100644
--- a/apps/cuesheet.c
+++ b/apps/cuesheet.c
@@ -231,15 +231,14 @@ bool parse_cuesheet(struct cuesheet_file *cue_file, struct cuesheet *cue)
char_enc = CHAR_ENC_UTF_8;
bom_read = BOM_UTF_8_SIZE;
}
- else if(!memcmp(line, BOM_UTF_16_LE, BOM_UTF_16_SIZE))
- {
- char_enc = CHAR_ENC_UTF_16_LE;
- bom_read = BOM_UTF_16_SIZE;
- }
- else if(!memcmp(line, BOM_UTF_16_BE, BOM_UTF_16_SIZE))
+ else
{
- char_enc = CHAR_ENC_UTF_16_BE;
- bom_read = BOM_UTF_16_SIZE;
+ bool le;
+ if (utf16_has_bom(line, &le))
+ {
+ char_enc = le ? CHAR_ENC_UTF_16_LE : CHAR_ENC_UTF_16_BE;
+ bom_read = BOM_UTF_16_SIZE;
+ }
}
}
diff --git a/firmware/common/unicode.c b/firmware/common/unicode.c
index 1ed2e5e49d..1e719c56eb 100644
--- a/firmware/common/unicode.c
+++ b/firmware/common/unicode.c
@@ -437,6 +437,28 @@ unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
return utf8;
}
+bool utf16_has_bom(const unsigned char *utf16, bool *le) /* returns true if the buffer starts with a UTF-16 BOM; *le is written on every path */
+{
+ unsigned long ucs = utf16[0] << 8 | utf16[1]; /* first two bytes, read without any length check, combined high byte first */
+
+ if (ucs == 0xFEFF) /* bytes FE FF: BOM in big-endian order */
+ {
+ *le = false;
+ return true;
+ }
+
+ if (ucs == 0xFFFE) /* bytes FF FE: byte-swapped BOM, i.e. little-endian order */
+ {
+ *le = true;
+ return true;
+ }
+
+ /* No BOM found: still report a best-effort guess through *le. A zero second byte
+ is assumed to be the most significant byte of the first unit, i.e. little-endian. */
+ *le = utf16[1] == 0;
+ return false;
+}
+
#if 0 /* currently unused */
/* Recode any UTF-16 string to UTF-8 */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
diff --git a/firmware/include/rbunicode.h b/firmware/include/rbunicode.h
index 1af560abfe..6dae7f169f 100644
--- a/firmware/include/rbunicode.h
+++ b/firmware/include/rbunicode.h
@@ -29,6 +29,7 @@
#define _RBUNICODE_H_
#include "config.h"
+#include <stdbool.h>
#define MASK 0xC0 /* 11000000 */
#define COMP 0x80 /* 10x */
@@ -58,6 +59,7 @@ unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8);
unsigned char* iso_decode(const unsigned char *latin1, unsigned char *utf8, int cp, int count);
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
+bool utf16_has_bom(const unsigned char *utf16, bool *le);
unsigned long utf8length(const unsigned char *utf8);
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs);
void set_codepage(int cp);
diff --git a/lib/rbcodec/metadata/id3tags.c b/lib/rbcodec/metadata/id3tags.c
index 588fdbef56..7006f5e739 100644
--- a/lib/rbcodec/metadata/id3tags.c
+++ b/lib/rbcodec/metadata/id3tags.c
@@ -570,8 +570,6 @@ static bool parse_as_utf8(char* string, int *len)
string. If it is, we convert it to a UTF-8 string. If it's not unicode,
we convert from the default codepage */
static void unicode_munge(char* string, char* utf8buf, int *len) {
- long tmp;
- bool le = false;
int i = 0;
unsigned char *str = (unsigned char *)string;
int templen = 0;
@@ -590,28 +588,17 @@ static void unicode_munge(char* string, char* utf8buf, int *len) {
case 0x02:
(*len)--;
str++;
+ bool le;
+
/* Handle frames with more than one string
(needed for TXXX frames).*/
do {
- tmp = bytes2int(0, 0, str[0], str[1]);
-
- /* Now check if there is a BOM
- (zero-width non-breaking space, 0xfeff)
- and if it is in little or big endian format */
- if(tmp == 0xfffe) { /* Little endian? */
- le = true;
- str += 2;
- (*len)-=2;
- } else if(tmp == 0xfeff) { /* Big endian? */
- str += 2;
- (*len)-=2;
- } else
- /* If there is no BOM (which is a specification violation),
- let's try to guess it. If one of the bytes is 0x00, it is
- probably the most significant one. */
- if(str[1] == 0)
- le = true;
+ if (utf16_has_bom(str, &le))
+ {
+ str += BOM_UTF_16_SIZE;
+ *len -= BOM_UTF_16_SIZE;
+ }
while ((i < *len) && (str[0] || str[1])) {
if(le)
@@ -734,17 +721,17 @@ static bool is_cuesheet(char *tag, unsigned char *char_enc, unsigned char *cuesh
switch (*(tag++))
{
case 0x01:
- if (!memcmp(tag, BOM_UTF_16_BE, BOM_UTF_16_SIZE))
- *char_enc = CHAR_ENC_UTF_16_BE;
- else if (!memcmp(tag, BOM_UTF_16_LE, BOM_UTF_16_SIZE))
- *char_enc = CHAR_ENC_UTF_16_LE;
- else
+ {
+ bool le;
+ if (!utf16_has_bom(tag, &le))
return false;
+ *char_enc = le ? CHAR_ENC_UTF_16_LE: CHAR_ENC_UTF_16_BE;
tag+= BOM_UTF_16_SIZE;
/* \1 + BOM(2) + C0U0E0S0H0E0E0T000 = 21 */
*cuesheet_offset = 21;
break;
+ }
case 0x02:
*char_enc = CHAR_ENC_UTF_16_BE;