lavu/avstring: add av_utf8_decode() function
This commit is contained in:
parent
e782eea183
commit
68590650f0
@ -15,6 +15,9 @@ libavutil: 2012-10-22
|
||||
|
||||
API changes, most recent first:
|
||||
|
||||
2013-11-XX - xxxxxxx - lavu 52.54.100 - avstring.h
|
||||
Add av_utf8_decode() function.
|
||||
|
||||
2013-11-xx - xxxxxxx - lavc 55.44.100 - avcodec.h
|
||||
Add av_packet_{un,}pack_dictionary()
|
||||
Add AV_PKT_METADATA_UPDATE side data type, used to transmit key/value
|
||||
|
@ -157,6 +157,7 @@ TESTPROGS = adler32 \
|
||||
sha \
|
||||
sha512 \
|
||||
tree \
|
||||
utf8 \
|
||||
xtea \
|
||||
|
||||
TESTPROGS-$(HAVE_LZO1X_999_COMPRESS) += lzo
|
||||
|
@ -307,6 +307,70 @@ int av_isxdigit(int c)
|
||||
return av_isdigit(c) || (c >= 'a' && c <= 'f');
|
||||
}
|
||||
|
||||
int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_end,
|
||||
unsigned int flags)
|
||||
{
|
||||
const uint8_t *p = *bufp;
|
||||
uint32_t top;
|
||||
uint64_t code;
|
||||
int ret = 0;
|
||||
|
||||
if (p >= buf_end)
|
||||
return 0;
|
||||
|
||||
code = *p++;
|
||||
|
||||
/* first sequence byte starts with 10, or is 1111-1110 or 1111-1111,
|
||||
which is not admitted */
|
||||
if ((code & 0xc0) == 0x80 || code >= 0xFE) {
|
||||
ret = AVERROR(EILSEQ);
|
||||
goto end;
|
||||
}
|
||||
top = (code & 128) >> 1;
|
||||
|
||||
while (code & top) {
|
||||
int tmp;
|
||||
if (p >= buf_end) {
|
||||
ret = AVERROR(EILSEQ); /* incomplete sequence */
|
||||
goto end;
|
||||
}
|
||||
|
||||
/* we assume the byte to be in the form 10xx-xxxx */
|
||||
tmp = *p++ - 128; /* strip leading 1 */
|
||||
if (tmp>>6) {
|
||||
ret = AVERROR(EILSEQ);
|
||||
goto end;
|
||||
}
|
||||
code = (code<<6) + tmp;
|
||||
top <<= 5;
|
||||
}
|
||||
code &= (top << 1) - 1;
|
||||
|
||||
if (code >= 1<<31) {
|
||||
ret = AVERROR(EILSEQ); /* out-of-range value */
|
||||
goto end;
|
||||
}
|
||||
|
||||
*codep = code;
|
||||
|
||||
if (code > 0x10FFFF &&
|
||||
!(flags & AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES))
|
||||
ret = AVERROR(EILSEQ);
|
||||
if (code < 0x20 && code != 0x9 && code != 0xA && code != 0xD &&
|
||||
flags & AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES)
|
||||
ret = AVERROR(EILSEQ);
|
||||
if (code >= 0xD800 && code <= 0xDFFF &&
|
||||
!(flags & AV_UTF8_FLAG_ACCEPT_SURROGATES))
|
||||
ret = AVERROR(EILSEQ);
|
||||
if (code == 0xFFFE || code == 0xFFFF &&
|
||||
(!flags & AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS))
|
||||
ret = AVERROR(EILSEQ);
|
||||
|
||||
end:
|
||||
*bufp = p;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef TEST
|
||||
|
||||
int main(void)
|
||||
|
@ -22,6 +22,7 @@
|
||||
#define AVUTIL_AVSTRING_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include "attributes.h"
|
||||
|
||||
/**
|
||||
@ -295,6 +296,45 @@ enum AVEscapeMode {
|
||||
int av_escape(char **dst, const char *src, const char *special_chars,
|
||||
enum AVEscapeMode mode, int flags);
|
||||
|
||||
#define AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES          1 ///< accept codepoints over 0x10FFFF
#define AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS             2 ///< accept non-characters - 0xFFFE and 0xFFFF
#define AV_UTF8_FLAG_ACCEPT_SURROGATES                 4 ///< accept UTF-16 surrogate codes
#define AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES 8 ///< exclude control codes not accepted by XML

/**
 * Union of the three AV_UTF8_FLAG_ACCEPT_* flags.
 * The expansion is parenthesized so that it can be combined safely with
 * other operators, e.g. (flags & AV_UTF8_FLAG_ACCEPT_ALL) or
 * ~AV_UTF8_FLAG_ACCEPT_ALL.
 */
#define AV_UTF8_FLAG_ACCEPT_ALL \
    (AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES | AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS | AV_UTF8_FLAG_ACCEPT_SURROGATES)
|
||||
|
||||
/**
|
||||
* Read and decode a single UTF-8 code point (character) from the
|
||||
* buffer in *bufp, and update *bufp to point to the next byte to
|
||||
* decode.
|
||||
*
|
||||
* In case of an invalid byte sequence, the pointer will be updated to
|
||||
* the next byte after the invalid sequence and the function will
|
||||
* return an error code.
|
||||
*
|
||||
* Depending on the specified flags, the function will also fail in
|
||||
* case the decoded code point does not belong to a valid range.
|
||||
*
|
||||
* @note For speed-relevant code a carefully implemented use of
|
||||
* GET_UTF8() may be preferred.
|
||||
*
|
||||
* @param codep pointer used to return the parsed code in case of success.
|
||||
* The value in *codep is set even in case the range check fails.
|
||||
* @param bufp pointer to the address of the first byte of the sequence
|
||||
* to decode, updated by the function to point to the
|
||||
* byte next after the decoded sequence
|
||||
* @param buf_end pointer to the end of the buffer, points to the next
|
||||
* byte past the last in the buffer. This is used to
|
||||
* avoid buffer overreads (in case of an unfinished
|
||||
* UTF-8 sequence towards the end of the buffer).
|
||||
* @param flags a collection of AV_UTF8_FLAG_* flags
|
||||
* @return >= 0 in case a sequence was successfully read, a negative
|
||||
* value in case of invalid sequence
|
||||
*/
|
||||
int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_end,
|
||||
unsigned int flags);
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
71
libavutil/utf8.c
Normal file
71
libavutil/utf8.c
Normal file
@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Stefano Sabatini
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "libavutil/avstring.h"
|
||||
#include "libavutil/file.h"
|
||||
|
||||
/**
 * Dump a byte sequence as uppercase hexadecimal, then pad with spaces.
 *
 * Each of the l bytes at p is printed as two hex digits; afterwards an
 * empty string is printed with a field width of indent - 2*l, which fills
 * the rest of the column with spaces.
 */
static void print_sequence(const char *p, int l, int indent)
{
    int remaining = l;
    int pad = indent - 2 * l;

    while (remaining-- > 0)
        printf("%02X", (uint8_t)*p++);

    printf("%*s", pad, "");
}
|
||||
|
||||
/**
 * Decode the file named by argv[1] one UTF-8 sequence at a time.
 *
 * For every sequence the consumed bytes are printed in hex, followed by
 * the decoded code point (decimal and hex), the sequence length, and either
 * the raw bytes, "invalid code range" (decoded but rejected by the decoder)
 * or "invalid sequence" (nothing was decoded).
 *
 * @return 0 on success, 1 if no file name was given or the file could not
 *         be mapped
 */
int main(int argc, char **argv)
{
    int ret;
    const char *filename;
    uint8_t *file_buf;
    size_t file_buf_size;
    int32_t code;
    const uint8_t *p, *endp;

    /* argv[1] is only meaningful when a file name was actually passed */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "utf8");
        return 1;
    }
    filename = argv[1];

    ret = av_file_map(filename, &file_buf, &file_buf_size, 0, NULL);
    if (ret < 0)
        return 1;

    p    = file_buf;
    endp = file_buf + file_buf_size;
    while (p < endp) {
        int l, r;
        const uint8_t *p0 = p;

        /* sentinel: av_utf8_decode() never stores a negative value, so -1
           still being there afterwards means nothing was decoded */
        code = -1;
        r = av_utf8_decode(&code, &p, endp, 0);
        l = (int)(p-p0);
        print_sequence((const char *)p0, l, 20);
        if (code != -1) {
            printf("%-10d 0x%-10X %-5d ", (int)code, (unsigned)code, l);
            if (r >= 0) {
                if (*p0 == '\n') printf("\\n\n");
                else             printf ("%.*s\n", l, (const char *)p0);
            } else {
                printf("invalid code range\n");
            }
        } else {
            printf("invalid sequence\n");
        }
    }

    av_file_unmap(file_buf, file_buf_size);
    return 0;
}
|
@ -75,7 +75,7 @@
|
||||
*/
|
||||
|
||||
#define LIBAVUTIL_VERSION_MAJOR 52
|
||||
#define LIBAVUTIL_VERSION_MINOR 53
|
||||
#define LIBAVUTIL_VERSION_MINOR 54
|
||||
#define LIBAVUTIL_VERSION_MICRO 100
|
||||
|
||||
#define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
|
||||
|
Loading…
Reference in New Issue
Block a user