From 8cb5ccad99aa4660dbace9025a8e1c0223358f08 Mon Sep 17 00:00:00 2001
From: tailhook <pc@gafol.net>
Date: Fri, 15 Apr 2011 17:36:17 +0300
Subject: [PATCH 1/4] Implemented encoding for strings

* Packer uses `utf-8` encoding by default
* Unpacker uses `None` by default, so no decoding is done
* Both pack and unpack have `encoding` and `unicode_errors` arguments;
  if `encoding` is `None`, no encoding/decoding is done, otherwise
  it names a Python codec. `unicode_errors` is supplied as the `errors`
  parameter to the codec
---
 python/msgpack/_msgpack.pyx | 72 ++++++++++++++++++++++++++++---------
 python/msgpack/unpack.h     |  8 ++++-
 python/test/test_pack.py    | 57 ++++++++++++++++++++++++++---
 python/test3/test_obj.py    |  2 +-
 python/test3/test_pack.py   | 55 ++++++++++++++++++++++++++--
 5 files changed, 169 insertions(+), 25 deletions(-)

diff --git a/python/msgpack/_msgpack.pyx b/python/msgpack/_msgpack.pyx
index cdcd0c81..443cbd75 100644
--- a/python/msgpack/_msgpack.pyx
+++ b/python/msgpack/_msgpack.pyx
@@ -36,7 +36,7 @@ cdef int DEFAULT_RECURSE_LIMIT=511
 
 cdef class Packer(object):
     """MessagePack Packer
-    
+
     usage:
 
         packer = Packer()
@@ -45,6 +45,10 @@ cdef class Packer(object):
     """
     cdef msgpack_packer pk
     cdef object _default
+    cdef object _bencoding
+    cdef object _berrors
+    cdef char *encoding
+    cdef char *unicode_errors
 
     def __cinit__(self):
         cdef int buf_size = 1024*1024
@@ -54,11 +58,25 @@ cdef class Packer(object):
         self.pk.buf_size = buf_size
         self.pk.length = 0
 
-    def __init__(self, default=None):
+    def __init__(self, default=None, encoding='utf-8', unicode_errors='strict'):
         if default is not None:
             if not PyCallable_Check(default):
                 raise TypeError("default must be a callable.")
         self._default = default
+        if encoding is None:
+            self.encoding = NULL
+            self.unicode_errors = NULL
+        else:
+            if isinstance(encoding, unicode):
+                self._bencoding = encoding.encode('ascii')
+            else:
+                self._bencoding = encoding
+            self.encoding = PyBytes_AsString(self._bencoding)
+            if isinstance(unicode_errors, unicode):
+                self._berrors = unicode_errors.encode('ascii')
+            else:
+                self._berrors = unicode_errors
+            self.unicode_errors = PyBytes_AsString(self._berrors)
 
     def __dealloc__(self):
         free(self.pk.buf);
@@ -68,7 +86,7 @@ cdef class Packer(object):
         cdef unsigned long long ullval
         cdef long longval
         cdef double fval
-        cdef char* rawval 
+        cdef char* rawval
         cdef int ret
         cdef dict d
 
@@ -101,7 +119,9 @@ cdef class Packer(object):
             if ret == 0:
                 ret = msgpack_pack_raw_body(&self.pk, rawval, len(o))
         elif PyUnicode_Check(o):
-            o = PyUnicode_AsUTF8String(o)
+            if not self.encoding:
+                raise TypeError("Can't encode utf-8 no encoding is specified")
+            o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
             rawval = o
             ret = msgpack_pack_raw(&self.pk, len(o))
             if ret == 0:
@@ -138,14 +158,14 @@ cdef class Packer(object):
         return buf
 
 
-def pack(object o, object stream, default=None):
+def pack(object o, object stream, default=None, encoding='utf-8', unicode_errors='strict'):
     """pack an object `o` and write it to stream)."""
-    packer = Packer(default=default)
+    packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors)
     stream.write(packer.pack(o))
 
-def packb(object o, default=None):
+def packb(object o, default=None, encoding='utf-8', unicode_errors='strict'):
     """pack o and return packed bytes."""
-    packer = Packer(default=default)
+    packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors)
     return packer.pack(o)
 
 dumps = packs = packb
@@ -155,6 +175,8 @@ cdef extern from "unpack.h":
         int use_list
         PyObject* object_hook
         PyObject* list_hook
+        char *encoding
+        char *unicode_errors
 
     ctypedef struct template_context:
         msgpack_user user
@@ -164,12 +186,12 @@ cdef extern from "unpack.h":
         PyObject* key
 
     int template_execute(template_context* ctx, const_char_ptr data,
-                         size_t len, size_t* off)
+                         size_t len, size_t* off) except -1
     void template_init(template_context* ctx)
     object template_data(template_context* ctx)
 
 
-def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0):
+def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"):
     """Unpack packed_bytes to object. Returns an unpacked object."""
     cdef template_context ctx
     cdef size_t off = 0
@@ -179,9 +201,25 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
     cdef Py_ssize_t buf_len
     PyObject_AsReadBuffer(packed, <const_void_ptr*>&buf, &buf_len)
 
+    if encoding is None:
+        enc = NULL
+    else:
+        if isinstance(encoding, unicode):
+            bencoding = encoding.encode('ascii')
+        else:
+            bencoding = encoding
+        if isinstance(unicode_errors, unicode):
+            berrors = unicode_errors.encode('ascii')
+        else:
+            berrors = unicode_errors
+        enc = PyBytes_AsString(bencoding)
+        err = PyBytes_AsString(berrors)
+
     template_init(&ctx)
     ctx.user.use_list = use_list
     ctx.user.object_hook = ctx.user.list_hook = NULL
+    ctx.user.encoding = enc
+    ctx.user.unicode_errors = err
     if object_hook is not None:
         if not PyCallable_Check(object_hook):
             raise TypeError("object_hook must be a callable.")
@@ -191,8 +229,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
             raise TypeError("list_hook must be a callable.")
         ctx.user.list_hook = <PyObject*>list_hook
     _gc_disable()
-    ret = template_execute(&ctx, buf, buf_len, &off)
-    _gc_enable()
+    try:
+        ret = template_execute(&ctx, buf, buf_len, &off)
+    finally:
+        _gc_enable()
     if ret == 1:
         return template_data(&ctx)
     else:
@@ -200,10 +240,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
 
 loads = unpacks = unpackb
 
-def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0):
+def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"):
     """unpack an object from stream."""
     return unpackb(stream.read(), use_list=use_list,
-                   object_hook=object_hook, list_hook=list_hook)
+                   object_hook=object_hook, list_hook=list_hook, encoding=encoding, unicode_errors=unicode_errors)
 
 cdef class Unpacker(object):
     """Unpacker(read_size=1024*1024)
@@ -236,7 +276,7 @@ cdef class Unpacker(object):
         self.buf = NULL;
 
     def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=0,
-                 object object_hook=None, object list_hook=None):
+                 object object_hook=None, object list_hook=None, encoding=None, unicode_errors=None):
         if read_size == 0:
             read_size = 1024*1024
         self.use_list = use_list
@@ -292,7 +332,7 @@ cdef class Unpacker(object):
                 new_size = tail + _buf_len
                 if new_size < buf_size*2:
                     new_size = buf_size*2
-                buf = <char*>realloc(buf, new_size) 
+                buf = <char*>realloc(buf, new_size)
                 if buf == NULL:
                     # self.buf still holds old buffer and will be freed during
                     # obj destruction
diff --git a/python/msgpack/unpack.h b/python/msgpack/unpack.h
index 453ec2b8..0586ca86 100644
--- a/python/msgpack/unpack.h
+++ b/python/msgpack/unpack.h
@@ -23,6 +23,8 @@ typedef struct unpack_user {
     int use_list;
     PyObject *object_hook;
     PyObject *list_hook;
+    const char *encoding;
+    const char *unicode_errors;
 } unpack_user;
 
 
@@ -197,7 +199,11 @@ static inline int template_callback_map_end(unpack_user* u, msgpack_unpack_objec
 static inline int template_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o)
 {
     PyObject *py;
-    py = PyBytes_FromStringAndSize(p, l);
+    if(u->encoding) {
+        py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors);
+    } else {
+        py = PyBytes_FromStringAndSize(p, l);
+    }
     if (!py)
         return -1;
     *o = py;
diff --git a/python/test/test_pack.py b/python/test/test_pack.py
index 5dec0680..2aef588b 100644
--- a/python/test/test_pack.py
+++ b/python/test/test_pack.py
@@ -15,14 +15,63 @@ def testPack():
             0, 1, 127, 128, 255, 256, 65535, 65536,
             -1, -32, -33, -128, -129, -32768, -32769,
             1.0,
-        "", "a", "a"*31, "a"*32,
+        b"", b"a", b"a"*31, b"a"*32,
         None, True, False,
-        (), ((),), ((), None,), 
-        {None: 0}, 
-        (1<<23), 
+        (), ((),), ((), None,),
+        {None: 0},
+        (1<<23),
         ]
     for td in test_data:
         check(td)
 
+def testPackUnicode():
+    test_data = [
+        u"", u"abcd", (u"defgh",), u"Русский текст",
+        ]
+    for td in test_data:
+        re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8')
+        assert_equal(re, td)
+
+def testPackUTF32():
+    test_data = [
+        u"", u"abcd", (u"defgh",), u"Русский текст",
+        ]
+    for td in test_data:
+        print(packs(td, encoding='utf-32'))
+        re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
+        assert_equal(re, td)
+
+def testPackBytes():
+    test_data = [
+        b"", b"abcd", (b"defgh",),
+        ]
+    for td in test_data:
+        check(td)
+
+def testIgnoreUnicodeErrors():
+    re = unpacks(packs(b'abc\xeddef'),
+        encoding='utf-8', unicode_errors='ignore')
+    assert_equal(re, "abcdef")
+
+@raises(UnicodeDecodeError)
+def testStrictUnicodeUnpack():
+    unpacks(packs(b'abc\xeddef'), encoding='utf-8')
+
+@raises(UnicodeEncodeError)
+def testStrictUnicodePack():
+    packs(u"abc\xeddef", encoding='ascii', unicode_errors='strict')
+
+def testIgnoreErrorsPack():
+    re = unpacks(packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8')
+    assert_equal(re, u"abcdef")
+
+@raises(TypeError)
+def testNoEncoding():
+    packs(u"abc", encoding=None)
+
+def testDecodeBinary():
+    re = unpacks(packs(u"abc"), encoding=None)
+    assert_equal(re, b"abc")
+
 if __name__ == '__main__':
     main()
diff --git a/python/test3/test_obj.py b/python/test3/test_obj.py
index 236988de..b54021f2 100644
--- a/python/test3/test_obj.py
+++ b/python/test3/test_obj.py
@@ -26,7 +26,7 @@ def test_decode_hook():
     unpacked = unpacks(packed, object_hook=_decode_complex)
     eq_(unpacked[1], 1+2j)
 
-@raises(TypeError)
+@raises(ValueError)
 def test_bad_hook():
     packed = packs([3, 1+2j], default=lambda o: o)
     unpacked = unpacks(packed)
diff --git a/python/test3/test_pack.py b/python/test3/test_pack.py
index c861704b..e53f7e64 100644
--- a/python/test3/test_pack.py
+++ b/python/test3/test_pack.py
@@ -17,12 +17,61 @@ def testPack():
             1.0,
         b"", b"a", b"a"*31, b"a"*32,
         None, True, False,
-        (), ((),), ((), None,), 
-        {None: 0}, 
-        (1<<23), 
+        (), ((),), ((), None,),
+        {None: 0},
+        (1<<23),
         ]
     for td in test_data:
         check(td)
 
+def testPackUnicode():
+    test_data = [
+        "", "abcd", ("defgh",), "Русский текст",
+        ]
+    for td in test_data:
+        re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8')
+        assert_equal(re, td)
+
+def testPackUTF32():
+    test_data = [
+        "", "abcd", ("defgh",), "Русский текст",
+        ]
+    for td in test_data:
+        print(packs(td, encoding='utf-32'))
+        re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
+        assert_equal(re, td)
+
+def testPackBytes():
+    test_data = [
+        b"", b"abcd", (b"defgh",),
+        ]
+    for td in test_data:
+        check(td)
+
+def testIgnoreUnicodeErrors():
+    re = unpacks(packs(b'abc\xeddef'),
+        encoding='utf-8', unicode_errors='ignore')
+    assert_equal(re, "abcdef")
+
+@raises(UnicodeDecodeError)
+def testStrictUnicodeUnpack():
+    unpacks(packs(b'abc\xeddef'), encoding='utf-8')
+
+@raises(UnicodeEncodeError)
+def testStrictUnicodePack():
+    packs("abc\xeddef", encoding='ascii', unicode_errors='strict')
+
+def testIgnoreErrorsPack():
+    re = unpacks(packs("abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8')
+    assert_equal(re, "abcdef")
+
+@raises(TypeError)
+def testNoEncoding():
+    packs("abc", encoding=None)
+
+def testDecodeBinary():
+    re = unpacks(packs("abc"), encoding=None)
+    assert_equal(re, b"abc")
+
 if __name__ == '__main__':
     main()

From bd73742552cf16592662a7ec5ba3608888081131 Mon Sep 17 00:00:00 2001
From: INADA Naoki <songofacandy@gmail.com>
Date: Tue, 31 May 2011 14:10:46 +0900
Subject: [PATCH 2/4] (python) Change error message for unicode is passed but
 no encoding is specified.

---
 python/msgpack/_msgpack.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/msgpack/_msgpack.pyx b/python/msgpack/_msgpack.pyx
index 443cbd75..14bc9d77 100644
--- a/python/msgpack/_msgpack.pyx
+++ b/python/msgpack/_msgpack.pyx
@@ -120,7 +120,7 @@ cdef class Packer(object):
                 ret = msgpack_pack_raw_body(&self.pk, rawval, len(o))
         elif PyUnicode_Check(o):
             if not self.encoding:
-                raise TypeError("Can't encode utf-8 no encoding is specified")
+                raise TypeError("Can't pack unicode object: No encoding is specified")
             o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
             rawval = o
             ret = msgpack_pack_raw(&self.pk, len(o))

From 709d0cc33e7ac5c2029bca17ee2e4b0e4b1df55d Mon Sep 17 00:00:00 2001
From: INADA Naoki <songofacandy@gmail.com>
Date: Tue, 31 May 2011 15:40:11 +0900
Subject: [PATCH 3/4] Revert "(python) Change error message for unicode is
 passed but no encoding is"

This reverts commit bd73742552cf16592662a7ec5ba3608888081131.
---
 python/ChangeLog.rst        | 10 ++++++++++
 python/msgpack/_msgpack.pyx |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/ChangeLog.rst b/python/ChangeLog.rst
index 75e86b27..a0aae257 100644
--- a/python/ChangeLog.rst
+++ b/python/ChangeLog.rst
@@ -1,3 +1,13 @@
+0.1.10
+======
+:release date: NOT RELEASED YET
+
+New feature
+-----------
+* Add ``encoding`` and ``unicode_errors`` option to packer and unpacker.
+  When this option is specified, (un)packs unicode object instead of bytes.
+  This enables using msgpack as a replacement of json.
+
 0.1.9
 ======
 :release date: 2011-01-29
diff --git a/python/msgpack/_msgpack.pyx b/python/msgpack/_msgpack.pyx
index 14bc9d77..443cbd75 100644
--- a/python/msgpack/_msgpack.pyx
+++ b/python/msgpack/_msgpack.pyx
@@ -120,7 +120,7 @@ cdef class Packer(object):
                 ret = msgpack_pack_raw_body(&self.pk, rawval, len(o))
         elif PyUnicode_Check(o):
             if not self.encoding:
-                raise TypeError("Can't pack unicode object: No encoding is specified")
+                raise TypeError("Can't encode utf-8 no encoding is specified")
             o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
             rawval = o
             ret = msgpack_pack_raw(&self.pk, len(o))

From be6d6560a7d0b1da6545ca8fff8a477348fdc52a Mon Sep 17 00:00:00 2001
From: inada-n <inada-n@sag16.klab.org>
Date: Wed, 1 Jun 2011 18:30:43 +0900
Subject: [PATCH 4/4] (python) make test pass with Python 2.5

---
 python/test/test_pack.py      | 33 +++++++++++++++++++--------------
 python/test/test_sequnpack.py | 20 ++++++++++----------
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/python/test/test_pack.py b/python/test/test_pack.py
index 2aef588b..2b5f1ade 100644
--- a/python/test/test_pack.py
+++ b/python/test/test_pack.py
@@ -3,6 +3,7 @@
 
 from nose import main
 from nose.tools import *
+from nose.plugins.skip import SkipTest
 
 from msgpack import packs, unpacks
 
@@ -15,7 +16,7 @@ def testPack():
             0, 1, 127, 128, 255, 256, 65535, 65536,
             -1, -32, -33, -128, -129, -32768, -32769,
             1.0,
-        b"", b"a", b"a"*31, b"a"*32,
+        "", "a", "a"*31, "a"*32,
         None, True, False,
         (), ((),), ((), None,),
         {None: 0},
@@ -33,36 +34,40 @@ def testPackUnicode():
         assert_equal(re, td)
 
 def testPackUTF32():
-    test_data = [
-        u"", u"abcd", (u"defgh",), u"Русский текст",
-        ]
-    for td in test_data:
-        print(packs(td, encoding='utf-32'))
-        re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
-        assert_equal(re, td)
+    try:
+        test_data = [
+            u"", u"abcd", (u"defgh",), u"Русский текст",
+            ]
+        for td in test_data:
+            re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
+            assert_equal(re, td)
+    except LookupError:
+        raise SkipTest
 
 def testPackBytes():
     test_data = [
-        b"", b"abcd", (b"defgh",),
+        "", "abcd", ("defgh",),
         ]
     for td in test_data:
         check(td)
 
 def testIgnoreUnicodeErrors():
-    re = unpacks(packs(b'abc\xeddef'),
-        encoding='utf-8', unicode_errors='ignore')
+    re = unpacks(packs('abc\xeddef'),
+        encoding='ascii', unicode_errors='ignore')
     assert_equal(re, "abcdef")
 
 @raises(UnicodeDecodeError)
 def testStrictUnicodeUnpack():
-    unpacks(packs(b'abc\xeddef'), encoding='utf-8')
+    unpacks(packs('abc\xeddef'), encoding='utf-8')
 
 @raises(UnicodeEncodeError)
 def testStrictUnicodePack():
     packs(u"abc\xeddef", encoding='ascii', unicode_errors='strict')
 
 def testIgnoreErrorsPack():
-    re = unpacks(packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8')
+    re = unpacks(
+            packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'),
+            encoding='utf-8')
     assert_equal(re, u"abcdef")
 
 @raises(TypeError)
@@ -71,7 +76,7 @@ def testNoEncoding():
 
 def testDecodeBinary():
     re = unpacks(packs(u"abc"), encoding=None)
-    assert_equal(re, b"abc")
+    assert_equal(re, "abc")
 
 if __name__ == '__main__':
     main()
diff --git a/python/test/test_sequnpack.py b/python/test/test_sequnpack.py
index c92658c1..d61be230 100644
--- a/python/test/test_sequnpack.py
+++ b/python/test/test_sequnpack.py
@@ -6,12 +6,12 @@ from msgpack import Unpacker
 def test_foobar():
     unpacker = Unpacker(read_size=3)
     unpacker.feed('foobar')
-    assert unpacker.unpack() == ord(b'f')
-    assert unpacker.unpack() == ord(b'o')
-    assert unpacker.unpack() == ord(b'o')
-    assert unpacker.unpack() == ord(b'b')
-    assert unpacker.unpack() == ord(b'a')
-    assert unpacker.unpack() == ord(b'r')
+    assert unpacker.unpack() == ord('f')
+    assert unpacker.unpack() == ord('o')
+    assert unpacker.unpack() == ord('o')
+    assert unpacker.unpack() == ord('b')
+    assert unpacker.unpack() == ord('a')
+    assert unpacker.unpack() == ord('r')
     try:
         o = unpacker.unpack()
         print "Oops!", o
@@ -20,14 +20,14 @@ def test_foobar():
         assert 1
     else:
         assert 0
-    unpacker.feed(b'foo')
-    unpacker.feed(b'bar')
+    unpacker.feed('foo')
+    unpacker.feed('bar')
 
     k = 0
-    for o, e in zip(unpacker, b'foobarbaz'):
+    for o, e in zip(unpacker, 'foobarbaz'):
         assert o == ord(e)
         k += 1
-    assert k == len(b'foobar')
+    assert k == len('foobar')
 
 if __name__ == '__main__':
     test_foobar()