Merge branch 'master' of git@github.com:msgpack/msgpack

2026-01-11 00:24:16 +01:00 · 2010-10-07 13:01:33 +09:00
parent 562d50df4d f3ee5ab372
commit 98eec72522
12 changed files with 130 additions and 11 deletions
--- a/cpp/bootstrap
+++ b/cpp/bootstrap
@@ -38,7 +38,8 @@ test -f ChangeLog || touch ChangeLog
 test -f NEWS      || touch NEWS
 test -f README    || cp -f README.md README

-if test ! ./preprocess; then
+./preprocess
+if [ $? -ne 0 ]; then
 	exit 1
 fi

--- a/perl/Changes
+++ b/perl/Changes
@@ -1,3 +1,12 @@
+
+0.30
+
+    - fix utf8 mode so that it is not reset by the $unpacker->reset method
+
+0.29
+
+    - add $unpacker->utf8 mode, decoding strings as UTF-8.
+
 0.28

    - added more tests(gfx)
--- a/perl/README
+++ b/perl/README
@@ -102,6 +102,11 @@ TODO
        "while(read($socket, $buffer, $arbitrary_buffer_size)) { ... }"). We
        should implement the internal buffer for the unpacker.

+    UTF8 mode
+        Data::MessagePack::Unpacker supports utf8 mode, which decodes
+        strings as UTF-8. "Data::MessagePack->unpack" should support
+        utf8 mode in the future.
+
 AUTHORS
    Tokuhiro Matsuno

--- a/perl/lib/Data/MessagePack.pm
+++ b/perl/lib/Data/MessagePack.pm
@@ -3,7 +3,7 @@ use strict;
 use warnings;
 use 5.008001;

-our $VERSION = '0.28';
+our $VERSION = '0.29';
 our $PreferInteger = 0;

 sub true () {
@@ -165,6 +165,12 @@ will astonish those who try to unpack byte streams with an arbitrary buffer size
 (e.g. C<< while(read($socket, $buffer, $arbitrary_buffer_size)) { ... } >>).
 We should implement the internal buffer for the unpacker.

+=item UTF8 mode
+
+Data::MessagePack::Unpacker supports utf8 mode, which decodes strings
+as UTF-8. C<< Data::MessagePack->unpack >> should support utf8 mode in
+the future.
+
 =back

 =head1 AUTHORS
--- a/perl/lib/Data/MessagePack/PP.pm
+++ b/perl/lib/Data/MessagePack/PP.pm
@@ -248,6 +248,7 @@ sub _pack {
 # UNPACK
 #

+our $_utf8 = 0;
 my $p; # position variables for speed.

 sub unpack :method {
@@ -358,7 +359,9 @@ sub _unpack {
            $num = $byte & ~0xa0;
            $p += $num;
        }
-        return substr( $value, $p - $num, $num );
+        my $s = substr( $value, $p - $num, $num );
+        utf8::decode($s) if $_utf8;
+        return $s;
    }

    elsif ( $byte == 0xc0 ) { # nil
@@ -396,9 +399,19 @@ package
    Data::MessagePack::PP::Unpacker;

 sub new {
-    bless { pos => 0 }, shift;
+    bless { pos => 0, utf8 => 0 }, shift;
 }

+sub utf8 { # set utf8 mode: utf8() enables it, utf8($bool) stores $bool
+    my($self, @flag) = @_;
+    $self->{utf8} = @flag ? $flag[0] : 1;
+    return $self; # returns the unpacker itself so calls can be chained
+}
+
+sub get_utf8 { # accessor: returns the stored utf8 mode flag
+    my $self = shift;
+    return $self->{utf8};
+}

 sub execute_limit {
    execute( @_ );
@@ -540,7 +553,9 @@ sub _count {


 sub data {
-    return Data::MessagePack->unpack( substr($_[0]->{ data }, 0, $_[0]->{pos}) );
+    my($self) = @_;
+    local $Data::MessagePack::PP::_utf8 = $self->{utf8};
+    return Data::MessagePack->unpack( substr($self->{ data }, 0, $self->{pos}) );
 }


--- a/perl/lib/Data/MessagePack/Unpacker.pod
+++ b/perl/lib/Data/MessagePack/Unpacker.pod
@@ -24,6 +24,19 @@ This is a streaming deserializer for messagepack.

 creates a new instance of stream deserializer.

+=item $up->utf8([$bool])
+
+sets utf8 mode. I<$bool> defaults to true if it is omitted.
+returns I<$up> itself.
+
+If utf8 mode is enabled, strings will be decoded as UTF-8.
+
+The utf8 mode is disabled by default.
+
+=item my $ret = $up->get_utf8()
+
+returns the utf8 mode flag of I<$up>.
+
 =item my $ret = $up->execute($data, $offset);

 =item my $ret = $up->execute_limit($data, $offset, $limit)
--- a/perl/t/15_utf8.t
+++ b/perl/t/15_utf8.t
@@ -0,0 +1,33 @@
+#!perl -w
+use strict;
+use Test::More;
+use Data::MessagePack;
+use utf8;
+
+my $data = [42, undef, 'foo', "\x{99f1}\x{99dd}"]; # last element holds wide characters
+my $packed = Data::MessagePack->pack($data) x 2; # two packed copies, back to back
+
+my $u = Data::MessagePack::Unpacker->new()->utf8(); # utf8() with no argument enables utf8 mode
+my $p = 0; # offset into $packed, advanced by each execute()
+for(1 .. 2) {
+    ok $u->get_utf8(); # on the second pass this also checks the flag survived reset()
+    $p = $u->execute($packed, $p);
+    my $d = $u->data();
+    $u->reset();
+    is_deeply $d, $data, 'decoded'; # strings come back as decoded character strings
+}
+
+is $u->utf8(0), $u, 'utf8(0)'; # utf8() returns the unpacker itself
+$p = 0;
+for(1 .. 2) {
+    ok !$u->get_utf8(); # utf8 mode stays off across reset()
+    $p = $u->execute($packed, $p);
+    my $d = $u->data();
+    $u->reset();
+    my $s = $data->[3];
+    utf8::encode($s); # expected value: the UTF-8 encoded bytes of the original string
+    is_deeply $d->[3], $s, 'not decoded';
+}
+
+done_testing;
+
--- a/perl/xs-src/MessagePack.c
+++ b/perl/xs-src/MessagePack.c
@@ -7,6 +7,8 @@
 XS(xs_pack);
 XS(xs_unpack);
 XS(xs_unpacker_new);
+XS(xs_unpacker_utf8);
+XS(xs_unpacker_get_utf8);
 XS(xs_unpacker_execute);
 XS(xs_unpacker_execute_limit);
 XS(xs_unpacker_is_finished);
@@ -28,6 +30,8 @@ XS(boot_Data__MessagePack) {
    newXS("Data::MessagePack::unpack", xs_unpack, __FILE__);

    newXS("Data::MessagePack::Unpacker::new",           xs_unpacker_new, __FILE__);
+    newXS("Data::MessagePack::Unpacker::utf8",          xs_unpacker_utf8, __FILE__);
+    newXS("Data::MessagePack::Unpacker::get_utf8",      xs_unpacker_get_utf8, __FILE__);
    newXS("Data::MessagePack::Unpacker::execute",       xs_unpacker_execute, __FILE__);
    newXS("Data::MessagePack::Unpacker::execute_limit", xs_unpacker_execute_limit, __FILE__);
    newXS("Data::MessagePack::Unpacker::is_finished",   xs_unpacker_is_finished, __FILE__);
--- a/perl/xs-src/unpack.c
+++ b/perl/xs-src/unpack.c
@@ -13,6 +13,7 @@ START_MY_CXT
 typedef struct {
    bool finished;
    bool incremented;
+    bool utf8;
 } unpack_user;

 #include "msgpack/unpack_define.h"
@@ -237,6 +238,9 @@ STATIC_INLINE int template_callback_raw(unpack_user* u PERL_UNUSED_DECL, const c
    dTHX;
    /*  newSVpvn(p, l) returns an undef if p == NULL */
    *o = ((l==0) ? newSVpvs("") : newSVpvn(p, l));
+    if(u->utf8) {
+        sv_utf8_decode(*o);
+    }
    return 0;
 }

@@ -276,7 +280,7 @@ XS(xs_unpack) {
    msgpack_unpack_t mp;
    template_init(&mp);

-    unpack_user const u = {false, false};
+    unpack_user const u = {false, false, false};
    mp.user = u;

    size_t from = 0;
@@ -303,7 +307,7 @@ XS(xs_unpack) {

 STATIC_INLINE void _reset(SV* const self) {
    dTHX;
-	unpack_user const u = {false, false};
+	unpack_user const u = {false, false, false};

 	UNPACKER(self, mp);
 	template_init(mp);
@@ -328,6 +332,26 @@ XS(xs_unpacker_new) {
    XSRETURN(1);
 }

+XS(xs_unpacker_utf8) { /* $unpacker->utf8([$bool]): sets the utf8 mode flag */
+    dXSARGS;
+    if (!(items == 1 || items == 2)) {
+        Perl_croak(aTHX_ "Usage: $unpacker->utf8([$bool])");
+    }
+    UNPACKER(ST(0), mp); /* presumably binds mp to the unpacker state behind ST(0) -- macro not shown here */
+    mp->user.utf8 = (items == 1 || sv_true(ST(1))) ? true : false; /* no argument means "enable" */
+    XSRETURN(1); // returns $self
+}
+
+XS(xs_unpacker_get_utf8) { /* $unpacker->get_utf8(): returns the utf8 mode flag */
+    dXSARGS;
+    if (items != 1) {
+        Perl_croak(aTHX_ "Usage: $unpacker->get_utf8()");
+    }
+    UNPACKER(ST(0), mp); /* presumably binds mp to the unpacker state behind ST(0) -- macro not shown here */
+    ST(0) = boolSV(mp->user.utf8); /* overwrite the invocant slot with the boolean result */
+    XSRETURN(1);
+}
+
 STATIC_INLINE size_t
 _execute_impl(SV* const self, SV* const data, UV const offset, UV const limit) {
    dTHX;
@@ -419,10 +443,12 @@ XS(xs_unpacker_reset) {
    }

    UNPACKER(ST(0), mp);
+    bool const utf8 = mp->user.utf8; // save

    SV* const data = template_data(mp);
    SvREFCNT_dec(data);
    _reset(ST(0));
+    mp->user.utf8 = utf8;

    XSRETURN(0);
 }
--- a/python/msgpack/_msgpack.pyx
+++ b/python/msgpack/_msgpack.pyx
@@ -7,6 +7,7 @@ cdef extern from "Python.h":
    cdef object PyBytes_FromStringAndSize(const_char_ptr b, Py_ssize_t len)
    cdef PyObject* Py_True
    cdef PyObject* Py_False
+    cdef object PyUnicode_AsUTF8String(object)

    cdef long long PyLong_AsLongLong(object o)
    cdef unsigned long long PyLong_AsUnsignedLongLong(object o)
@@ -105,7 +106,7 @@ cdef class Packer(object):
            if ret == 0:
                ret = msgpack_pack_raw_body(&self.pk, rawval, len(o))
        elif PyUnicode_Check(o):
-            o = o.encode('utf-8')
+            o = PyUnicode_AsUTF8String(o)
            rawval = o
            ret = msgpack_pack_raw(&self.pk, len(o))
            if ret == 0:
@@ -169,7 +170,7 @@ cdef extern from "unpack.h":
    object template_data(template_context* ctx)


-def unpackb(object packed_bytes):
+def unpackb(bytes packed_bytes):
    """Unpack packed_bytes to object. Returns an unpacked object."""
    cdef const_char_ptr p = packed_bytes
    cdef template_context ctx
@@ -232,7 +233,7 @@ cdef class Unpacker(object):
    cdef object file_like
    cdef int read_size
    cdef object waiting_bytes
-    cdef int use_list
+    cdef bint use_list

    def __cinit__(self):
        self.buf = NULL
@@ -241,7 +242,7 @@ cdef class Unpacker(object):
        if self.buf:
            free(self.buf);

-    def __init__(self, file_like=None, int read_size=0, use_list=0):
+    def __init__(self, file_like=None, int read_size=0, bint use_list=0):
        if read_size == 0:
            read_size = 1024*1024
        self.use_list = use_list
--- a/python/test/test_case.py
+++ b/python/test/test_case.py
@@ -98,5 +98,8 @@ def test_match():
    for v, p in cases:
        match(v, p)

+def test_unicode():  # packing a unicode string and unpacking it yields the plain 'foobar' string
+    assert_equal('foobar', unpacks(packs(u'foobar')))
+
 if __name__ == '__main__':
    main()
--- a/python/test3/test_case.py
+++ b/python/test3/test_case.py
@@ -98,5 +98,8 @@ def test_match():
    for v, p in cases:
        match(v, p)

+def test_unicode():  # packing a str and unpacking it yields the bytes b'foobar'
+    assert_equal(b'foobar', unpacks(packs('foobar')))
+
 if __name__ == '__main__':
    main()