Merge branch 'master' of git@github.com:msgpack/msgpack

Muga Nishizawa 2011-06-12 02:48:42 +09:00
commit d70e64a434
7 changed files with 193 additions and 34 deletions

View File

@@ -1,3 +1,13 @@
0.1.10
======
:release date: NOT RELEASED YET
New feature
-----------
* Add ``encoding`` and ``unicode_errors`` options to packer and unpacker.
  When these options are specified, unicode objects are packed and unpacked instead of bytes.
  This makes it possible to use msgpack as a drop-in replacement for json.
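A minimal round-trip sketch of the new options (assuming the ``packb``/``unpackb`` names, which the tests below use via the ``packs``/``unpacks`` aliases, are re-exported by the ``msgpack`` package)::

    import msgpack

    # Packing a unicode object encodes it with the given codec ...
    data = msgpack.packb(u"Русский текст", encoding='utf-8')
    # ... and unpacking with the same codec returns a unicode object, not bytes.
    assert msgpack.unpackb(data, encoding='utf-8') == u"Русский текст"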
0.1.9
======
:release date: 2011-01-29

View File

@@ -45,6 +45,10 @@ cdef class Packer(object):
"""
cdef msgpack_packer pk
cdef object _default
cdef object _bencoding
cdef object _berrors
cdef char *encoding
cdef char *unicode_errors
def __cinit__(self):
cdef int buf_size = 1024*1024
@@ -54,11 +58,25 @@ cdef class Packer(object):
self.pk.buf_size = buf_size
self.pk.length = 0
def __init__(self, default=None):
def __init__(self, default=None, encoding='utf-8', unicode_errors='strict'):
if default is not None:
if not PyCallable_Check(default):
raise TypeError("default must be a callable.")
self._default = default
if encoding is None:
self.encoding = NULL
self.unicode_errors = NULL
else:
if isinstance(encoding, unicode):
self._bencoding = encoding.encode('ascii')
else:
self._bencoding = encoding
self.encoding = PyBytes_AsString(self._bencoding)
if isinstance(unicode_errors, unicode):
self._berrors = unicode_errors.encode('ascii')
else:
self._berrors = unicode_errors
self.unicode_errors = PyBytes_AsString(self._berrors)
def __dealloc__(self):
free(self.pk.buf);
@@ -101,7 +119,9 @@ cdef class Packer(object):
if ret == 0:
ret = msgpack_pack_raw_body(&self.pk, rawval, len(o))
elif PyUnicode_Check(o):
o = PyUnicode_AsUTF8String(o)
if not self.encoding:
raise TypeError("Can't encode unicode string: no encoding is specified")
o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
rawval = o
ret = msgpack_pack_raw(&self.pk, len(o))
if ret == 0:
@@ -138,14 +158,14 @@ cdef class Packer(object):
return buf
def pack(object o, object stream, default=None):
def pack(object o, object stream, default=None, encoding='utf-8', unicode_errors='strict'):
"""pack an object `o` and write it to `stream`."""
packer = Packer(default=default)
packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors)
stream.write(packer.pack(o))
def packb(object o, default=None):
def packb(object o, default=None, encoding='utf-8', unicode_errors='strict'):
"""pack o and return packed bytes."""
packer = Packer(default=default)
packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors)
return packer.pack(o)
dumps = packs = packb
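A hedged sketch of the stream-based helper above; ``io.BytesIO`` is only an assumed stand-in for any object with a ``write`` method:

    import io
    import msgpack

    buf = io.BytesIO()
    # pack() encodes the unicode value and writes the packed bytes to the stream.
    msgpack.pack(u"hello", buf, encoding='utf-8')
    # unpackb() decodes the raw value back to unicode when an encoding is given.
    assert msgpack.unpackb(buf.getvalue(), encoding='utf-8') == u"hello"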
@@ -155,6 +175,8 @@ cdef extern from "unpack.h":
int use_list
PyObject* object_hook
PyObject* list_hook
char *encoding
char *unicode_errors
ctypedef struct template_context:
msgpack_user user
@@ -164,12 +186,12 @@ cdef extern from "unpack.h":
PyObject* key
int template_execute(template_context* ctx, const_char_ptr data,
size_t len, size_t* off)
size_t len, size_t* off) except -1
void template_init(template_context* ctx)
object template_data(template_context* ctx)
def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0):
def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"):
"""Unpack packed bytes to an object. Returns the unpacked object."""
cdef template_context ctx
cdef size_t off = 0
@@ -179,9 +201,25 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
cdef Py_ssize_t buf_len
PyObject_AsReadBuffer(packed, <const_void_ptr*>&buf, &buf_len)
if encoding is None:
enc = NULL
else:
if isinstance(encoding, unicode):
bencoding = encoding.encode('ascii')
else:
bencoding = encoding
if isinstance(unicode_errors, unicode):
berrors = unicode_errors.encode('ascii')
else:
berrors = unicode_errors
enc = PyBytes_AsString(bencoding)
err = PyBytes_AsString(berrors)
template_init(&ctx)
ctx.user.use_list = use_list
ctx.user.object_hook = ctx.user.list_hook = NULL
ctx.user.encoding = enc
ctx.user.unicode_errors = err
if object_hook is not None:
if not PyCallable_Check(object_hook):
raise TypeError("object_hook must be a callable.")
@@ -191,7 +229,9 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
raise TypeError("list_hook must be a callable.")
ctx.user.list_hook = <PyObject*>list_hook
_gc_disable()
try:
ret = template_execute(&ctx, buf, buf_len, &off)
finally:
_gc_enable()
if ret == 1:
return template_data(&ctx)
@@ -200,10 +240,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
loads = unpacks = unpackb
def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0):
def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"):
"""unpack an object from stream."""
return unpackb(stream.read(), use_list=use_list,
object_hook=object_hook, list_hook=list_hook)
object_hook=object_hook, list_hook=list_hook, encoding=encoding, unicode_errors=unicode_errors)
cdef class Unpacker(object):
"""Unpacker(read_size=1024*1024)
@@ -236,7 +276,7 @@ cdef class Unpacker(object):
self.buf = NULL;
def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=0,
object object_hook=None, object list_hook=None):
object object_hook=None, object list_hook=None, encoding=None, unicode_errors=None):
if read_size == 0:
read_size = 1024*1024
self.use_list = use_list

View File

@@ -23,6 +23,8 @@ typedef struct unpack_user {
int use_list;
PyObject *object_hook;
PyObject *list_hook;
const char *encoding;
const char *unicode_errors;
} unpack_user;
@@ -197,7 +199,11 @@ static inline int template_callback_map_end(unpack_user* u, msgpack_unpack_objec
static inline int template_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o)
{
PyObject *py;
if(u->encoding) {
py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors);
} else {
py = PyBytes_FromStringAndSize(p, l);
}
if (!py)
return -1;
*o = py;
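The Python-visible effect of the branch above, inferred from the tests added in this commit:

    import msgpack

    packed = msgpack.packb(u"abc", encoding='utf-8')
    # With encoding=None the raw value comes back as bytes (PyBytes_FromStringAndSize) ...
    assert msgpack.unpackb(packed, encoding=None) == b"abc"
    # ... while a configured encoding decodes it to a unicode object (PyUnicode_Decode).
    assert msgpack.unpackb(packed, encoding='utf-8') == u"abc"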

View File

@@ -3,6 +3,7 @@
from nose import main
from nose.tools import *
from nose.plugins.skip import SkipTest
from msgpack import packs, unpacks
@@ -24,5 +25,58 @@ def testPack():
for td in test_data:
check(td)
def testPackUnicode():
test_data = [
u"", u"abcd", (u"defgh",), u"Русский текст",
]
for td in test_data:
re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8')
assert_equal(re, td)
def testPackUTF32():
try:
test_data = [
u"", u"abcd", (u"defgh",), u"Русский текст",
]
for td in test_data:
re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
assert_equal(re, td)
except LookupError:
raise SkipTest
def testPackBytes():
test_data = [
"", "abcd", ("defgh",),
]
for td in test_data:
check(td)
def testIgnoreUnicodeErrors():
re = unpacks(packs('abc\xeddef'),
encoding='ascii', unicode_errors='ignore')
assert_equal(re, "abcdef")
@raises(UnicodeDecodeError)
def testStrictUnicodeUnpack():
unpacks(packs('abc\xeddef'), encoding='utf-8')
@raises(UnicodeEncodeError)
def testStrictUnicodePack():
packs(u"abc\xeddef", encoding='ascii', unicode_errors='strict')
def testIgnoreErrorsPack():
re = unpacks(
packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'),
encoding='utf-8')
assert_equal(re, u"abcdef")
@raises(TypeError)
def testNoEncoding():
packs(u"abc", encoding=None)
def testDecodeBinary():
re = unpacks(packs(u"abc"), encoding=None)
assert_equal(re, "abc")
if __name__ == '__main__':
main()

View File

@@ -6,12 +6,12 @@ from msgpack import Unpacker
def test_foobar():
unpacker = Unpacker(read_size=3)
unpacker.feed('foobar')
assert unpacker.unpack() == ord(b'f')
assert unpacker.unpack() == ord(b'o')
assert unpacker.unpack() == ord(b'o')
assert unpacker.unpack() == ord(b'b')
assert unpacker.unpack() == ord(b'a')
assert unpacker.unpack() == ord(b'r')
assert unpacker.unpack() == ord('f')
assert unpacker.unpack() == ord('o')
assert unpacker.unpack() == ord('o')
assert unpacker.unpack() == ord('b')
assert unpacker.unpack() == ord('a')
assert unpacker.unpack() == ord('r')
try:
o = unpacker.unpack()
print "Oops!", o
@@ -20,14 +20,14 @@ def test_foobar():
assert 1
else:
assert 0
unpacker.feed(b'foo')
unpacker.feed(b'bar')
unpacker.feed('foo')
unpacker.feed('bar')
k = 0
for o, e in zip(unpacker, b'foobarbaz'):
for o, e in zip(unpacker, 'foobarbaz'):
assert o == ord(e)
k += 1
assert k == len(b'foobar')
assert k == len('foobar')
if __name__ == '__main__':
test_foobar()

View File

@@ -26,7 +26,7 @@ def test_decode_hook():
unpacked = unpacks(packed, object_hook=_decode_complex)
eq_(unpacked[1], 1+2j)
@raises(TypeError)
@raises(ValueError)
def test_bad_hook():
packed = packs([3, 1+2j], default=lambda o: o)
unpacked = unpacks(packed)

View File

@@ -24,5 +24,54 @@ def testPack():
for td in test_data:
check(td)
def testPackUnicode():
test_data = [
"", "abcd", ("defgh",), "Русский текст",
]
for td in test_data:
re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8')
assert_equal(re, td)
def testPackUTF32():
test_data = [
"", "abcd", ("defgh",), "Русский текст",
]
for td in test_data:
print(packs(td, encoding='utf-32'))
re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
assert_equal(re, td)
def testPackBytes():
test_data = [
b"", b"abcd", (b"defgh",),
]
for td in test_data:
check(td)
def testIgnoreUnicodeErrors():
re = unpacks(packs(b'abc\xeddef'),
encoding='utf-8', unicode_errors='ignore')
assert_equal(re, "abcdef")
@raises(UnicodeDecodeError)
def testStrictUnicodeUnpack():
unpacks(packs(b'abc\xeddef'), encoding='utf-8')
@raises(UnicodeEncodeError)
def testStrictUnicodePack():
packs("abc\xeddef", encoding='ascii', unicode_errors='strict')
def testIgnoreErrorsPack():
re = unpacks(packs("abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8')
assert_equal(re, "abcdef")
@raises(TypeError)
def testNoEncoding():
packs("abc", encoding=None)
def testDecodeBinary():
re = unpacks(packs("abc"), encoding=None)
assert_equal(re, b"abc")
if __name__ == '__main__':
main()