diff --git a/python/ChangeLog.rst b/python/ChangeLog.rst index 75e86b27..a0aae257 100644 --- a/python/ChangeLog.rst +++ b/python/ChangeLog.rst @@ -1,3 +1,13 @@ +0.1.10 +====== +:release date: NOT RELEASED YET + +New feature +----------- +* Add ``encoding`` and ``unicode_errors`` options to packer and unpacker. + When this option is specified, (un)packs unicode object instead of bytes. + This enables using msgpack as a replacement of json. + 0.1.9 ====== :release date: 2011-01-29 diff --git a/python/msgpack/_msgpack.pyx b/python/msgpack/_msgpack.pyx index cdcd0c81..443cbd75 100644 --- a/python/msgpack/_msgpack.pyx +++ b/python/msgpack/_msgpack.pyx @@ -36,7 +36,7 @@ cdef int DEFAULT_RECURSE_LIMIT=511 cdef class Packer(object): """MessagePack Packer - + usage: packer = Packer() @@ -45,6 +45,10 @@ cdef class Packer(object): """ cdef msgpack_packer pk cdef object _default + cdef object _bencoding + cdef object _berrors + cdef char *encoding + cdef char *unicode_errors def __cinit__(self): cdef int buf_size = 1024*1024 @@ -54,11 +58,25 @@ cdef class Packer(object): self.pk.buf_size = buf_size self.pk.length = 0 - def __init__(self, default=None): + def __init__(self, default=None, encoding='utf-8', unicode_errors='strict'): if default is not None: if not PyCallable_Check(default): raise TypeError("default must be a callable.") self._default = default + if encoding is None: + self.encoding = NULL + self.unicode_errors = NULL + else: + if isinstance(encoding, unicode): + self._bencoding = encoding.encode('ascii') + else: + self._bencoding = encoding + self.encoding = PyBytes_AsString(self._bencoding) + if isinstance(unicode_errors, unicode): + self._berrors = unicode_errors.encode('ascii') + else: + self._berrors = unicode_errors + self.unicode_errors = PyBytes_AsString(self._berrors) def __dealloc__(self): free(self.pk.buf); @@ -68,7 +86,7 @@ cdef class Packer(object): cdef unsigned long long ullval cdef long longval cdef double fval - cdef char* rawval + cdef char* rawval 
cdef int ret cdef dict d @@ -101,7 +119,9 @@ cdef class Packer(object): if ret == 0: ret = msgpack_pack_raw_body(&self.pk, rawval, len(o)) elif PyUnicode_Check(o): - o = PyUnicode_AsUTF8String(o) + if not self.encoding: + raise TypeError("Can't encode utf-8 no encoding is specified") + o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) rawval = o ret = msgpack_pack_raw(&self.pk, len(o)) if ret == 0: @@ -138,14 +158,14 @@ cdef class Packer(object): return buf -def pack(object o, object stream, default=None): +def pack(object o, object stream, default=None, encoding='utf-8', unicode_errors='strict'): """pack an object `o` and write it to stream).""" - packer = Packer(default=default) + packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors) stream.write(packer.pack(o)) -def packb(object o, default=None): +def packb(object o, default=None, encoding='utf-8', unicode_errors='strict'): """pack o and return packed bytes.""" - packer = Packer(default=default) + packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors) return packer.pack(o) dumps = packs = packb @@ -155,6 +175,8 @@ cdef extern from "unpack.h": int use_list PyObject* object_hook PyObject* list_hook + char *encoding + char *unicode_errors ctypedef struct template_context: msgpack_user user @@ -164,12 +186,12 @@ cdef extern from "unpack.h": PyObject* key int template_execute(template_context* ctx, const_char_ptr data, - size_t len, size_t* off) + size_t len, size_t* off) except -1 void template_init(template_context* ctx) object template_data(template_context* ctx) -def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0): +def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"): """Unpack packed_bytes to object. 
Returns an unpacked object.""" cdef template_context ctx cdef size_t off = 0 @@ -179,9 +201,25 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint cdef Py_ssize_t buf_len PyObject_AsReadBuffer(packed, &buf, &buf_len) + if encoding is None: + enc = NULL + else: + if isinstance(encoding, unicode): + bencoding = encoding.encode('ascii') + else: + bencoding = encoding + if isinstance(unicode_errors, unicode): + berrors = unicode_errors.encode('ascii') + else: + berrors = unicode_errors + enc = PyBytes_AsString(bencoding) + err = PyBytes_AsString(berrors) + template_init(&ctx) ctx.user.use_list = use_list ctx.user.object_hook = ctx.user.list_hook = NULL + ctx.user.encoding = enc + ctx.user.unicode_errors = err if object_hook is not None: if not PyCallable_Check(object_hook): raise TypeError("object_hook must be a callable.") @@ -191,8 +229,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint raise TypeError("list_hook must be a callable.") ctx.user.list_hook = list_hook _gc_disable() - ret = template_execute(&ctx, buf, buf_len, &off) - _gc_enable() + try: + ret = template_execute(&ctx, buf, buf_len, &off) + finally: + _gc_enable() if ret == 1: return template_data(&ctx) else: @@ -200,10 +240,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint loads = unpacks = unpackb -def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0): +def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"): """unpack an object from stream.""" return unpackb(stream.read(), use_list=use_list, - object_hook=object_hook, list_hook=list_hook) + object_hook=object_hook, list_hook=list_hook, encoding=encoding, unicode_errors=unicode_errors) cdef class Unpacker(object): """Unpacker(read_size=1024*1024) @@ -236,7 +276,7 @@ cdef class Unpacker(object): self.buf = NULL; def __init__(self, 
file_like=None, Py_ssize_t read_size=0, bint use_list=0, - object object_hook=None, object list_hook=None): + object object_hook=None, object list_hook=None, encoding=None, unicode_errors=None): if read_size == 0: read_size = 1024*1024 self.use_list = use_list @@ -292,7 +332,7 @@ cdef class Unpacker(object): new_size = tail + _buf_len if new_size < buf_size*2: new_size = buf_size*2 - buf = realloc(buf, new_size) + buf = realloc(buf, new_size) if buf == NULL: # self.buf still holds old buffer and will be freed during # obj destruction diff --git a/python/msgpack/unpack.h b/python/msgpack/unpack.h index 453ec2b8..0586ca86 100644 --- a/python/msgpack/unpack.h +++ b/python/msgpack/unpack.h @@ -23,6 +23,8 @@ typedef struct unpack_user { int use_list; PyObject *object_hook; PyObject *list_hook; + const char *encoding; + const char *unicode_errors; } unpack_user; @@ -197,7 +199,11 @@ static inline int template_callback_map_end(unpack_user* u, msgpack_unpack_objec static inline int template_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o) { PyObject *py; - py = PyBytes_FromStringAndSize(p, l); + if(u->encoding) { + py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors); + } else { + py = PyBytes_FromStringAndSize(p, l); + } if (!py) return -1; *o = py; diff --git a/python/test/test_pack.py b/python/test/test_pack.py index 5dec0680..2b5f1ade 100644 --- a/python/test/test_pack.py +++ b/python/test/test_pack.py @@ -3,6 +3,7 @@ from nose import main from nose.tools import * +from nose.plugins.skip import SkipTest from msgpack import packs, unpacks @@ -17,12 +18,65 @@ def testPack(): 1.0, "", "a", "a"*31, "a"*32, None, True, False, - (), ((),), ((), None,), - {None: 0}, - (1<<23), + (), ((),), ((), None,), + {None: 0}, + (1<<23), ] for td in test_data: check(td) +def testPackUnicode(): + test_data = [ + u"", u"abcd", (u"defgh",), u"Русский текст", + ] + for td in test_data: + re = unpacks(packs(td, encoding='utf-8'), 
encoding='utf-8') + assert_equal(re, td) + +def testPackUTF32(): + try: + test_data = [ + u"", u"abcd", (u"defgh",), u"Русский текст", + ] + for td in test_data: + re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32') + assert_equal(re, td) + except LookupError: + raise SkipTest + +def testPackBytes(): + test_data = [ + "", "abcd", ("defgh",), + ] + for td in test_data: + check(td) + +def testIgnoreUnicodeErrors(): + re = unpacks(packs('abc\xeddef'), + encoding='ascii', unicode_errors='ignore') + assert_equal(re, "abcdef") + +@raises(UnicodeDecodeError) +def testStrictUnicodeUnpack(): + unpacks(packs('abc\xeddef'), encoding='utf-8') + +@raises(UnicodeEncodeError) +def testStrictUnicodePack(): + packs(u"abc\xeddef", encoding='ascii', unicode_errors='strict') + +def testIgnoreErrorsPack(): + re = unpacks( + packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'), + encoding='utf-8') + assert_equal(re, u"abcdef") + +@raises(TypeError) +def testNoEncoding(): + packs(u"abc", encoding=None) + +def testDecodeBinary(): + re = unpacks(packs(u"abc"), encoding=None) + assert_equal(re, "abc") + if __name__ == '__main__': main() diff --git a/python/test/test_sequnpack.py b/python/test/test_sequnpack.py index c92658c1..d61be230 100644 --- a/python/test/test_sequnpack.py +++ b/python/test/test_sequnpack.py @@ -6,12 +6,12 @@ from msgpack import Unpacker def test_foobar(): unpacker = Unpacker(read_size=3) unpacker.feed('foobar') - assert unpacker.unpack() == ord(b'f') - assert unpacker.unpack() == ord(b'o') - assert unpacker.unpack() == ord(b'o') - assert unpacker.unpack() == ord(b'b') - assert unpacker.unpack() == ord(b'a') - assert unpacker.unpack() == ord(b'r') + assert unpacker.unpack() == ord('f') + assert unpacker.unpack() == ord('o') + assert unpacker.unpack() == ord('o') + assert unpacker.unpack() == ord('b') + assert unpacker.unpack() == ord('a') + assert unpacker.unpack() == ord('r') try: o = unpacker.unpack() print "Oops!", o @@ -20,14 +20,14 @@ def 
test_foobar(): assert 1 else: assert 0 - unpacker.feed(b'foo') - unpacker.feed(b'bar') + unpacker.feed('foo') + unpacker.feed('bar') k = 0 - for o, e in zip(unpacker, b'foobarbaz'): + for o, e in zip(unpacker, 'foobarbaz'): assert o == ord(e) k += 1 - assert k == len(b'foobar') + assert k == len('foobar') if __name__ == '__main__': test_foobar() diff --git a/python/test3/test_obj.py b/python/test3/test_obj.py index 236988de..b54021f2 100644 --- a/python/test3/test_obj.py +++ b/python/test3/test_obj.py @@ -26,7 +26,7 @@ def test_decode_hook(): unpacked = unpacks(packed, object_hook=_decode_complex) eq_(unpacked[1], 1+2j) -@raises(TypeError) +@raises(ValueError) def test_bad_hook(): packed = packs([3, 1+2j], default=lambda o: o) unpacked = unpacks(packed) diff --git a/python/test3/test_pack.py b/python/test3/test_pack.py index c861704b..e53f7e64 100644 --- a/python/test3/test_pack.py +++ b/python/test3/test_pack.py @@ -17,12 +17,61 @@ def testPack(): 1.0, b"", b"a", b"a"*31, b"a"*32, None, True, False, - (), ((),), ((), None,), - {None: 0}, - (1<<23), + (), ((),), ((), None,), + {None: 0}, + (1<<23), ] for td in test_data: check(td) +def testPackUnicode(): + test_data = [ + "", "abcd", ("defgh",), "Русский текст", + ] + for td in test_data: + re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8') + assert_equal(re, td) + +def testPackUTF32(): + test_data = [ + "", "abcd", ("defgh",), "Русский текст", + ] + for td in test_data: + print(packs(td, encoding='utf-32')) + re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32') + assert_equal(re, td) + +def testPackBytes(): + test_data = [ + b"", b"abcd", (b"defgh",), + ] + for td in test_data: + check(td) + +def testIgnoreUnicodeErrors(): + re = unpacks(packs(b'abc\xeddef'), + encoding='utf-8', unicode_errors='ignore') + assert_equal(re, "abcdef") + +@raises(UnicodeDecodeError) +def testStrictUnicodeUnpack(): + unpacks(packs(b'abc\xeddef'), encoding='utf-8') + +@raises(UnicodeEncodeError) +def 
testStrictUnicodePack(): + packs("abc\xeddef", encoding='ascii', unicode_errors='strict') + +def testIgnoreErrorsPack(): + re = unpacks(packs("abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8') + assert_equal(re, "abcdef") + +@raises(TypeError) +def testNoEncoding(): + packs("abc", encoding=None) + +def testDecodeBinary(): + re = unpacks(packs("abc"), encoding=None) + assert_equal(re, b"abc") + if __name__ == '__main__': main()