From 8cb5ccad99aa4660dbace9025a8e1c0223358f08 Mon Sep 17 00:00:00 2001 From: tailhook Date: Fri, 15 Apr 2011 17:36:17 +0300 Subject: [PATCH 1/4] Implemented encoding for strings * Packer by default uses `utf-8` encoding by default * Unpacker uses `None` by default, so no decoding is done * Both pack and unpack has `encoding` and `unicode_errors` arguments, if `encoding` is `None` no encoding/decoding is done, otherwise it is python codec. `unicode_errors` is supplied as `errors` parameter to codec --- python/msgpack/_msgpack.pyx | 72 ++++++++++++++++++++++++++++--------- python/msgpack/unpack.h | 8 ++++- python/test/test_pack.py | 57 ++++++++++++++++++++++++++--- python/test3/test_obj.py | 2 +- python/test3/test_pack.py | 55 ++++++++++++++++++++++++++-- 5 files changed, 169 insertions(+), 25 deletions(-) diff --git a/python/msgpack/_msgpack.pyx b/python/msgpack/_msgpack.pyx index cdcd0c81..443cbd75 100644 --- a/python/msgpack/_msgpack.pyx +++ b/python/msgpack/_msgpack.pyx @@ -36,7 +36,7 @@ cdef int DEFAULT_RECURSE_LIMIT=511 cdef class Packer(object): """MessagePack Packer - + usage: packer = Packer() @@ -45,6 +45,10 @@ cdef class Packer(object): """ cdef msgpack_packer pk cdef object _default + cdef object _bencoding + cdef object _berrors + cdef char *encoding + cdef char *unicode_errors def __cinit__(self): cdef int buf_size = 1024*1024 @@ -54,11 +58,25 @@ cdef class Packer(object): self.pk.buf_size = buf_size self.pk.length = 0 - def __init__(self, default=None): + def __init__(self, default=None, encoding='utf-8', unicode_errors='strict'): if default is not None: if not PyCallable_Check(default): raise TypeError("default must be a callable.") self._default = default + if encoding is None: + self.encoding = NULL + self.unicode_errors = NULL + else: + if isinstance(encoding, unicode): + self._bencoding = encoding.encode('ascii') + else: + self._bencoding = encoding + self.encoding = PyBytes_AsString(self._bencoding) + if isinstance(unicode_errors, unicode): + 
self._berrors = unicode_errors.encode('ascii') + else: + self._berrors = unicode_errors + self.unicode_errors = PyBytes_AsString(self._berrors) def __dealloc__(self): free(self.pk.buf); @@ -68,7 +86,7 @@ cdef class Packer(object): cdef unsigned long long ullval cdef long longval cdef double fval - cdef char* rawval + cdef char* rawval cdef int ret cdef dict d @@ -101,7 +119,9 @@ cdef class Packer(object): if ret == 0: ret = msgpack_pack_raw_body(&self.pk, rawval, len(o)) elif PyUnicode_Check(o): - o = PyUnicode_AsUTF8String(o) + if not self.encoding: + raise TypeError("Can't encode utf-8 no encoding is specified") + o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) rawval = o ret = msgpack_pack_raw(&self.pk, len(o)) if ret == 0: @@ -138,14 +158,14 @@ cdef class Packer(object): return buf -def pack(object o, object stream, default=None): +def pack(object o, object stream, default=None, encoding='utf-8', unicode_errors='strict'): """pack an object `o` and write it to stream).""" - packer = Packer(default=default) + packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors) stream.write(packer.pack(o)) -def packb(object o, default=None): +def packb(object o, default=None, encoding='utf-8', unicode_errors='strict'): """pack o and return packed bytes.""" - packer = Packer(default=default) + packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors) return packer.pack(o) dumps = packs = packb @@ -155,6 +175,8 @@ cdef extern from "unpack.h": int use_list PyObject* object_hook PyObject* list_hook + char *encoding + char *unicode_errors ctypedef struct template_context: msgpack_user user @@ -164,12 +186,12 @@ cdef extern from "unpack.h": PyObject* key int template_execute(template_context* ctx, const_char_ptr data, - size_t len, size_t* off) + size_t len, size_t* off) except -1 void template_init(template_context* ctx) object template_data(template_context* ctx) -def unpackb(object packed, object 
object_hook=None, object list_hook=None, bint use_list=0): +def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"): """Unpack packed_bytes to object. Returns an unpacked object.""" cdef template_context ctx cdef size_t off = 0 @@ -179,9 +201,25 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint cdef Py_ssize_t buf_len PyObject_AsReadBuffer(packed, &buf, &buf_len) + if encoding is None: + enc = NULL + else: + if isinstance(encoding, unicode): + bencoding = encoding.encode('ascii') + else: + bencoding = encoding + if isinstance(unicode_errors, unicode): + berrors = unicode_errors.encode('ascii') + else: + berrors = unicode_errors + enc = PyBytes_AsString(bencoding) + err = PyBytes_AsString(berrors) + template_init(&ctx) ctx.user.use_list = use_list ctx.user.object_hook = ctx.user.list_hook = NULL + ctx.user.encoding = enc + ctx.user.unicode_errors = err if object_hook is not None: if not PyCallable_Check(object_hook): raise TypeError("object_hook must be a callable.") @@ -191,8 +229,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint raise TypeError("list_hook must be a callable.") ctx.user.list_hook = list_hook _gc_disable() - ret = template_execute(&ctx, buf, buf_len, &off) - _gc_enable() + try: + ret = template_execute(&ctx, buf, buf_len, &off) + finally: + _gc_enable() if ret == 1: return template_data(&ctx) else: @@ -200,10 +240,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint loads = unpacks = unpackb -def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0): +def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"): """unpack an object from stream.""" return unpackb(stream.read(), use_list=use_list, - object_hook=object_hook, list_hook=list_hook) + object_hook=object_hook, 
list_hook=list_hook, encoding=encoding, unicode_errors=unicode_errors) cdef class Unpacker(object): """Unpacker(read_size=1024*1024) @@ -236,7 +276,7 @@ cdef class Unpacker(object): self.buf = NULL; def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=0, - object object_hook=None, object list_hook=None): + object object_hook=None, object list_hook=None, encoding=None, unicode_errors=None): if read_size == 0: read_size = 1024*1024 self.use_list = use_list @@ -292,7 +332,7 @@ cdef class Unpacker(object): new_size = tail + _buf_len if new_size < buf_size*2: new_size = buf_size*2 - buf = realloc(buf, new_size) + buf = realloc(buf, new_size) if buf == NULL: # self.buf still holds old buffer and will be freed during # obj destruction diff --git a/python/msgpack/unpack.h b/python/msgpack/unpack.h index 453ec2b8..0586ca86 100644 --- a/python/msgpack/unpack.h +++ b/python/msgpack/unpack.h @@ -23,6 +23,8 @@ typedef struct unpack_user { int use_list; PyObject *object_hook; PyObject *list_hook; + const char *encoding; + const char *unicode_errors; } unpack_user; @@ -197,7 +199,11 @@ static inline int template_callback_map_end(unpack_user* u, msgpack_unpack_objec static inline int template_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o) { PyObject *py; - py = PyBytes_FromStringAndSize(p, l); + if(u->encoding) { + py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors); + } else { + py = PyBytes_FromStringAndSize(p, l); + } if (!py) return -1; *o = py; diff --git a/python/test/test_pack.py b/python/test/test_pack.py index 5dec0680..2aef588b 100644 --- a/python/test/test_pack.py +++ b/python/test/test_pack.py @@ -15,14 +15,63 @@ def testPack(): 0, 1, 127, 128, 255, 256, 65535, 65536, -1, -32, -33, -128, -129, -32768, -32769, 1.0, - "", "a", "a"*31, "a"*32, + b"", b"a", b"a"*31, b"a"*32, None, True, False, - (), ((),), ((), None,), - {None: 0}, - (1<<23), + (), ((),), ((), None,), + {None: 0}, + (1<<23), 
] for td in test_data: check(td) +def testPackUnicode(): + test_data = [ + u"", u"abcd", (u"defgh",), u"Русский текст", + ] + for td in test_data: + re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8') + assert_equal(re, td) + +def testPackUTF32(): + test_data = [ + u"", u"abcd", (u"defgh",), u"Русский текст", + ] + for td in test_data: + print(packs(td, encoding='utf-32')) + re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32') + assert_equal(re, td) + +def testPackBytes(): + test_data = [ + b"", b"abcd", (b"defgh",), + ] + for td in test_data: + check(td) + +def testIgnoreUnicodeErrors(): + re = unpacks(packs(b'abc\xeddef'), + encoding='utf-8', unicode_errors='ignore') + assert_equal(re, "abcdef") + +@raises(UnicodeDecodeError) +def testStrictUnicodeUnpack(): + unpacks(packs(b'abc\xeddef'), encoding='utf-8') + +@raises(UnicodeEncodeError) +def testStrictUnicodePack(): + packs(u"abc\xeddef", encoding='ascii', unicode_errors='strict') + +def testIgnoreErrorsPack(): + re = unpacks(packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8') + assert_equal(re, u"abcdef") + +@raises(TypeError) +def testNoEncoding(): + packs(u"abc", encoding=None) + +def testDecodeBinary(): + re = unpacks(packs(u"abc"), encoding=None) + assert_equal(re, b"abc") + if __name__ == '__main__': main() diff --git a/python/test3/test_obj.py b/python/test3/test_obj.py index 236988de..b54021f2 100644 --- a/python/test3/test_obj.py +++ b/python/test3/test_obj.py @@ -26,7 +26,7 @@ def test_decode_hook(): unpacked = unpacks(packed, object_hook=_decode_complex) eq_(unpacked[1], 1+2j) -@raises(TypeError) +@raises(ValueError) def test_bad_hook(): packed = packs([3, 1+2j], default=lambda o: o) unpacked = unpacks(packed) diff --git a/python/test3/test_pack.py b/python/test3/test_pack.py index c861704b..e53f7e64 100644 --- a/python/test3/test_pack.py +++ b/python/test3/test_pack.py @@ -17,12 +17,61 @@ def testPack(): 1.0, b"", b"a", b"a"*31, b"a"*32, None, True, 
False, - (), ((),), ((), None,), - {None: 0}, - (1<<23), + (), ((),), ((), None,), + {None: 0}, + (1<<23), ] for td in test_data: check(td) +def testPackUnicode(): + test_data = [ + "", "abcd", ("defgh",), "Русский текст", + ] + for td in test_data: + re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8') + assert_equal(re, td) + +def testPackUTF32(): + test_data = [ + "", "abcd", ("defgh",), "Русский текст", + ] + for td in test_data: + print(packs(td, encoding='utf-32')) + re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32') + assert_equal(re, td) + +def testPackBytes(): + test_data = [ + b"", b"abcd", (b"defgh",), + ] + for td in test_data: + check(td) + +def testIgnoreUnicodeErrors(): + re = unpacks(packs(b'abc\xeddef'), + encoding='utf-8', unicode_errors='ignore') + assert_equal(re, "abcdef") + +@raises(UnicodeDecodeError) +def testStrictUnicodeUnpack(): + unpacks(packs(b'abc\xeddef'), encoding='utf-8') + +@raises(UnicodeEncodeError) +def testStrictUnicodePack(): + packs("abc\xeddef", encoding='ascii', unicode_errors='strict') + +def testIgnoreErrorsPack(): + re = unpacks(packs("abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8') + assert_equal(re, "abcdef") + +@raises(TypeError) +def testNoEncoding(): + packs("abc", encoding=None) + +def testDecodeBinary(): + re = unpacks(packs("abc"), encoding=None) + assert_equal(re, b"abc") + if __name__ == '__main__': main() From bd73742552cf16592662a7ec5ba3608888081131 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Tue, 31 May 2011 14:10:46 +0900 Subject: [PATCH 2/4] (python) Change error message for unicode is passed but no encoding is specified. 
--- python/msgpack/_msgpack.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/msgpack/_msgpack.pyx b/python/msgpack/_msgpack.pyx index 443cbd75..14bc9d77 100644 --- a/python/msgpack/_msgpack.pyx +++ b/python/msgpack/_msgpack.pyx @@ -120,7 +120,7 @@ cdef class Packer(object): ret = msgpack_pack_raw_body(&self.pk, rawval, len(o)) elif PyUnicode_Check(o): if not self.encoding: - raise TypeError("Can't encode utf-8 no encoding is specified") + raise TypeError("Can't pack unicode object: No encoding is specified") o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) rawval = o ret = msgpack_pack_raw(&self.pk, len(o)) From 709d0cc33e7ac5c2029bca17ee2e4b0e4b1df55d Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Tue, 31 May 2011 15:40:11 +0900 Subject: [PATCH 3/4] Revert "(python) Change error message for unicode is passed but no encoding is" This reverts commit bd73742552cf16592662a7ec5ba3608888081131. --- python/ChangeLog.rst | 10 ++++++++++ python/msgpack/_msgpack.pyx | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/python/ChangeLog.rst b/python/ChangeLog.rst index 75e86b27..a0aae257 100644 --- a/python/ChangeLog.rst +++ b/python/ChangeLog.rst @@ -1,3 +1,13 @@ +0.1.10 +====== +:release date: NOT RELEASED YET + +New feature +----------- +* Add ``encoding`` and ``unicode_errors`` option to packer and unpacker. + When this option is specified, (un)packs unicode object instead of bytes. + This enables using msgpack as a replacement of json. 
+ 0.1.9 ====== :release date: 2011-01-29 diff --git a/python/msgpack/_msgpack.pyx b/python/msgpack/_msgpack.pyx index 14bc9d77..443cbd75 100644 --- a/python/msgpack/_msgpack.pyx +++ b/python/msgpack/_msgpack.pyx @@ -120,7 +120,7 @@ cdef class Packer(object): ret = msgpack_pack_raw_body(&self.pk, rawval, len(o)) elif PyUnicode_Check(o): if not self.encoding: - raise TypeError("Can't pack unicode object: No encoding is specified") + raise TypeError("Can't encode utf-8 no encoding is specified") o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) rawval = o ret = msgpack_pack_raw(&self.pk, len(o)) From be6d6560a7d0b1da6545ca8fff8a477348fdc52a Mon Sep 17 00:00:00 2001 From: inada-n Date: Wed, 1 Jun 2011 18:30:43 +0900 Subject: [PATCH 4/4] (python) make test pass with Python 2.5 --- python/test/test_pack.py | 33 +++++++++++++++++++-------------- python/test/test_sequnpack.py | 20 ++++++++++---------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/python/test/test_pack.py b/python/test/test_pack.py index 2aef588b..2b5f1ade 100644 --- a/python/test/test_pack.py +++ b/python/test/test_pack.py @@ -3,6 +3,7 @@ from nose import main from nose.tools import * +from nose.plugins.skip import SkipTest from msgpack import packs, unpacks @@ -15,7 +16,7 @@ def testPack(): 0, 1, 127, 128, 255, 256, 65535, 65536, -1, -32, -33, -128, -129, -32768, -32769, 1.0, - b"", b"a", b"a"*31, b"a"*32, + "", "a", "a"*31, "a"*32, None, True, False, (), ((),), ((), None,), {None: 0}, @@ -33,36 +34,40 @@ def testPackUnicode(): assert_equal(re, td) def testPackUTF32(): - test_data = [ - u"", u"abcd", (u"defgh",), u"Русский текст", - ] - for td in test_data: - print(packs(td, encoding='utf-32')) - re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32') - assert_equal(re, td) + try: + test_data = [ + u"", u"abcd", (u"defgh",), u"Русский текст", + ] + for td in test_data: + re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32') + assert_equal(re, td) + except 
LookupError: + raise SkipTest def testPackBytes(): test_data = [ - b"", b"abcd", (b"defgh",), + "", "abcd", ("defgh",), ] for td in test_data: check(td) def testIgnoreUnicodeErrors(): - re = unpacks(packs(b'abc\xeddef'), - encoding='utf-8', unicode_errors='ignore') + re = unpacks(packs('abc\xeddef'), + encoding='ascii', unicode_errors='ignore') assert_equal(re, "abcdef") @raises(UnicodeDecodeError) def testStrictUnicodeUnpack(): - unpacks(packs(b'abc\xeddef'), encoding='utf-8') + unpacks(packs('abc\xeddef'), encoding='utf-8') @raises(UnicodeEncodeError) def testStrictUnicodePack(): packs(u"abc\xeddef", encoding='ascii', unicode_errors='strict') def testIgnoreErrorsPack(): - re = unpacks(packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8') + re = unpacks( + packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'), + encoding='utf-8') assert_equal(re, u"abcdef") @raises(TypeError) @@ -71,7 +76,7 @@ def testNoEncoding(): def testDecodeBinary(): re = unpacks(packs(u"abc"), encoding=None) - assert_equal(re, b"abc") + assert_equal(re, "abc") if __name__ == '__main__': main() diff --git a/python/test/test_sequnpack.py b/python/test/test_sequnpack.py index c92658c1..d61be230 100644 --- a/python/test/test_sequnpack.py +++ b/python/test/test_sequnpack.py @@ -6,12 +6,12 @@ from msgpack import Unpacker def test_foobar(): unpacker = Unpacker(read_size=3) unpacker.feed('foobar') - assert unpacker.unpack() == ord(b'f') - assert unpacker.unpack() == ord(b'o') - assert unpacker.unpack() == ord(b'o') - assert unpacker.unpack() == ord(b'b') - assert unpacker.unpack() == ord(b'a') - assert unpacker.unpack() == ord(b'r') + assert unpacker.unpack() == ord('f') + assert unpacker.unpack() == ord('o') + assert unpacker.unpack() == ord('o') + assert unpacker.unpack() == ord('b') + assert unpacker.unpack() == ord('a') + assert unpacker.unpack() == ord('r') try: o = unpacker.unpack() print "Oops!", o @@ -20,14 +20,14 @@ def test_foobar(): assert 1 else: 
assert 0 - unpacker.feed(b'foo') - unpacker.feed(b'bar') + unpacker.feed('foo') + unpacker.feed('bar') k = 0 - for o, e in zip(unpacker, b'foobarbaz'): + for o, e in zip(unpacker, 'foobarbaz'): assert o == ord(e) k += 1 - assert k == len(b'foobar') + assert k == len('foobar') if __name__ == '__main__': test_foobar()