diff options
author | Roman Artiukhin <bahusdrive@gmail.com> | 2024-12-11 16:00:55 +0200 |
---|---|---|
committer | Roman Artiukhin <bahusdrive@gmail.com> | 2024-12-13 17:38:05 +0200 |
commit | a23002cd5e8dc555cd9d8f5c5df46839614bbe83 (patch) | |
tree | 2879f618ac9a3a670639d868f47ea2cc5ffad6f4 | |
parent | 11fbbc78262b1ec8498cf11a307a9233274ddcfe (diff) | |
download | rockbox-a23002cd5e.tar.gz rockbox-a23002cd5e.zip |
unicode: Unify check for UTF-16 BOM
Adds utf16_has_bom function
Change-Id: I67ea474c9cf6ca6e6684351c2f54131164b7903c
-rw-r--r-- | apps/cuesheet.c | 15 | ||||
-rw-r--r-- | firmware/common/unicode.c | 22 | ||||
-rw-r--r-- | firmware/include/rbunicode.h | 2 | ||||
-rw-r--r-- | lib/rbcodec/metadata/id3tags.c | 37 |
4 files changed, 43 insertions, 33 deletions
diff --git a/apps/cuesheet.c b/apps/cuesheet.c index 2fe24d0a40..227ef5902c 100644 --- a/apps/cuesheet.c +++ b/apps/cuesheet.c @@ -231,15 +231,14 @@ bool parse_cuesheet(struct cuesheet_file *cue_file, struct cuesheet *cue) char_enc = CHAR_ENC_UTF_8; bom_read = BOM_UTF_8_SIZE; } - else if(!memcmp(line, BOM_UTF_16_LE, BOM_UTF_16_SIZE)) - { - char_enc = CHAR_ENC_UTF_16_LE; - bom_read = BOM_UTF_16_SIZE; - } - else if(!memcmp(line, BOM_UTF_16_BE, BOM_UTF_16_SIZE)) + else { - char_enc = CHAR_ENC_UTF_16_BE; - bom_read = BOM_UTF_16_SIZE; + bool le; + if (utf16_has_bom(line, &le)) + { + char_enc = le ? CHAR_ENC_UTF_16_LE : CHAR_ENC_UTF_16_BE; + bom_read = BOM_UTF_16_SIZE; + } } } diff --git a/firmware/common/unicode.c b/firmware/common/unicode.c index 1ed2e5e49d..1e719c56eb 100644 --- a/firmware/common/unicode.c +++ b/firmware/common/unicode.c @@ -437,6 +437,28 @@ unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, return utf8; } +bool utf16_has_bom(const unsigned char *utf16, bool *le) +{ + unsigned long ucs = utf16[0] << 8 | utf16[1]; + + if (ucs == 0xFEFF) /* Check for BOM */ + { + *le = false; + return true; + } + + if (ucs == 0xFFFE) + { + *le = true; + return true; + } + + /* If there is no BOM let's try to guess it. If one of the bytes is 0x00, it is + probably the most significant one. */ + *le = utf16[1] == 0; + return false; +} + #if 0 /* currently unused */ /* Recode any UTF-16 string to UTF-8 */ unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8, diff --git a/firmware/include/rbunicode.h b/firmware/include/rbunicode.h index 1af560abfe..6dae7f169f 100644 --- a/firmware/include/rbunicode.h +++ b/firmware/include/rbunicode.h @@ -29,6 +29,7 @@ #define _RBUNICODE_H_ #include "config.h" +#include <stdbool.h> #define MASK 0xC0 /* 11000000 */ #define COMP 0x80 /* 10x */ @@ -58,6 +59,7 @@ unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8); unsigned char* iso_decode(const unsigned char *latin1, unsigned char *utf8, int cp, int count); unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, int count); unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, int count); +bool utf16_has_bom(const unsigned char *utf16, bool *le); unsigned long utf8length(const unsigned char *utf8); const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs); void set_codepage(int cp); diff --git a/lib/rbcodec/metadata/id3tags.c b/lib/rbcodec/metadata/id3tags.c index 588fdbef56..7006f5e739 100644 --- a/lib/rbcodec/metadata/id3tags.c +++ b/lib/rbcodec/metadata/id3tags.c @@ -570,8 +570,6 @@ static bool parse_as_utf8(char* string, int *len) string. If it is, we convert it to a UTF-8 string. If it's not unicode, we convert from the default codepage */ static void unicode_munge(char* string, char* utf8buf, int *len) { - long tmp; - bool le = false; int i = 0; unsigned char *str = (unsigned char *)string; int templen = 0; @@ -590,28 +588,17 @@ static void unicode_munge(char* string, char* utf8buf, int *len) { case 0x02: (*len)--; str++; + bool le; + /* Handle frames with more than one string (needed for TXXX frames).*/ do { - tmp = bytes2int(0, 0, str[0], str[1]); - - /* Now check if there is a BOM - (zero-width non-breaking space, 0xfeff) - and if it is in little or big endian format */ - if(tmp == 0xfffe) { /* Little endian? */ - le = true; - str += 2; - (*len)-=2; - } else if(tmp == 0xfeff) { /* Big endian? */ - str += 2; - (*len)-=2; - } else - /* If there is no BOM (which is a specification violation), - let's try to guess it. If one of the bytes is 0x00, it is - probably the most significant one. */ - if(str[1] == 0) - le = true; + if (utf16_has_bom(str, &le)) + { + str += BOM_UTF_16_SIZE; + *len -= BOM_UTF_16_SIZE; + } while ((i < *len) && (str[0] || str[1])) { if(le) @@ -734,17 +721,17 @@ static bool is_cuesheet(char *tag, unsigned char *char_enc, unsigned char *cuesh switch (*(tag++)) { case 0x01: - if (!memcmp(tag, BOM_UTF_16_BE, BOM_UTF_16_SIZE)) - *char_enc = CHAR_ENC_UTF_16_BE; - else if (!memcmp(tag, BOM_UTF_16_LE, BOM_UTF_16_SIZE)) - *char_enc = CHAR_ENC_UTF_16_LE; - else + { + bool le; + if (!utf16_has_bom(tag, &le)) return false; + *char_enc = le ? CHAR_ENC_UTF_16_LE: CHAR_ENC_UTF_16_BE; tag+= BOM_UTF_16_SIZE; /* \1 + BOM(2) + C0U0E0S0H0E0E0T000 = 21 */ *cuesheet_offset = 21; break; + } case 0x02: *char_enc = CHAR_ENC_UTF_16_BE; |