diff --git a/cpp/bootstrap b/cpp/bootstrap index 7f3a182a..1ff6b76e 100755 --- a/cpp/bootstrap +++ b/cpp/bootstrap @@ -38,7 +38,8 @@ test -f ChangeLog || touch ChangeLog test -f NEWS || touch NEWS test -f README || cp -f README.md README -if test ! ./preprocess; then +./preprocess +if [ $? -ne 0 ]; then exit 1 fi diff --git a/perl/Changes b/perl/Changes index b506234b..091c875a 100644 --- a/perl/Changes +++ b/perl/Changes @@ -1,3 +1,12 @@ + +0.30 + + - fix utf8 mode so that it is no longer reset by the $unpacker->reset method + +0.29 + + - add $unpacker->utf8 mode, decoding strings as UTF-8. + 0.28 - added more tests(gfx) diff --git a/perl/README b/perl/README index 224ff08c..ae924ee6 100644 --- a/perl/README +++ b/perl/README @@ -102,6 +102,11 @@ TODO "while(read($socket, $buffer, $arbitrary_buffer_size)) { ... }"). We should implement the internal buffer for the unpacker. + UTF8 mode + Data::MessagePack::Unpacker supports utf8 mode, which decodes + strings as UTF-8. "Data::MessagePack->unpack" should support + utf8 mode in the future. + AUTHORS Tokuhiro Matsuno diff --git a/perl/lib/Data/MessagePack.pm b/perl/lib/Data/MessagePack.pm index 7d1bda78..01e0fc05 100644 --- a/perl/lib/Data/MessagePack.pm +++ b/perl/lib/Data/MessagePack.pm @@ -3,7 +3,7 @@ use strict; use warnings; use 5.008001; -our $VERSION = '0.28'; +our $VERSION = '0.29'; our $PreferInteger = 0; sub true () { @@ -165,6 +165,12 @@ will astonish those who try to unpack byte streams with an arbitrary buffer size (e.g. C<< while(read($socket, $buffer, $arbitrary_buffer_size)) { ... } >>). We should implement the internal buffer for the unpacker. +=item UTF8 mode + +Data::MessagePack::Unpacker supports utf8 mode, which decodes strings +as UTF-8. C<< Data::MessagePack->unpack >> should support utf8 mode in the +future. 
+ =back =head1 AUTHORS diff --git a/perl/lib/Data/MessagePack/PP.pm b/perl/lib/Data/MessagePack/PP.pm index 5dccc0bb..00e58b92 100644 --- a/perl/lib/Data/MessagePack/PP.pm +++ b/perl/lib/Data/MessagePack/PP.pm @@ -248,6 +248,7 @@ sub _pack { # UNPACK # +our $_utf8 = 0; my $p; # position variables for speed. sub unpack :method { @@ -358,7 +359,9 @@ sub _unpack { $num = $byte & ~0xa0; $p += $num; } - return substr( $value, $p - $num, $num ); + my $s = substr( $value, $p - $num, $num ); + utf8::decode($s) if $_utf8; + return $s; } elsif ( $byte == 0xc0 ) { # nil @@ -396,9 +399,19 @@ package Data::MessagePack::PP::Unpacker; sub new { - bless { pos => 0 }, shift; + bless { pos => 0, utf8 => 0 }, shift; } +sub utf8 { + my $self = shift; + $self->{utf8} = (@_ ? shift : 1); + return $self; +} + +sub get_utf8 { + my($self) = @_; + return $self->{utf8}; +} sub execute_limit { execute( @_ ); @@ -540,7 +553,9 @@ sub _count { sub data { - return Data::MessagePack->unpack( substr($_[0]->{ data }, 0, $_[0]->{pos}) ); + my($self) = @_; + local $Data::MessagePack::PP::_utf8 = $self->{utf8}; + return Data::MessagePack->unpack( substr($self->{ data }, 0, $self->{pos}) ); } diff --git a/perl/lib/Data/MessagePack/Unpacker.pod b/perl/lib/Data/MessagePack/Unpacker.pod index 2bc4549c..24dafd00 100644 --- a/perl/lib/Data/MessagePack/Unpacker.pod +++ b/perl/lib/Data/MessagePack/Unpacker.pod @@ -24,6 +24,19 @@ This is a streaming deserializer for messagepack. creates a new instance of stream deserializer. +=item $up->utf8([$bool]) + +sets utf8 mode. true if I<$bool> is omitted. +returns I<$up> itself. + +If utf8 mode is enabled, strings will be decoded as UTF-8. + +The utf8 mode is disabled by default. + +=item my $ret = $up->get_utf8() + +returns the utf8 mode flag of I<$up>. 
+ =item my $ret = $up->execute($data, $offset); =item my $ret = $up->execute_limit($data, $offset, $limit) diff --git a/perl/t/15_utf8.t b/perl/t/15_utf8.t new file mode 100644 index 00000000..f3163dfa --- /dev/null +++ b/perl/t/15_utf8.t @@ -0,0 +1,33 @@ +#!perl -w +use strict; +use Test::More; +use Data::MessagePack; +use utf8; + +my $data = [42, undef, 'foo', "\x{99f1}\x{99dd}"]; +my $packed = Data::MessagePack->pack($data) x 2; + +my $u = Data::MessagePack::Unpacker->new()->utf8(); +my $p = 0; +for(1 .. 2) { + ok $u->get_utf8(); + $p = $u->execute($packed, $p); + my $d = $u->data(); + $u->reset(); + is_deeply $d, $data, 'decoded'; +} + +is $u->utf8(0), $u, 'utf8(0)'; +$p = 0; +for(1 .. 2) { + ok !$u->get_utf8(); + $p = $u->execute($packed, $p); + my $d = $u->data(); + $u->reset(); + my $s = $data->[3]; + utf8::encode($s); + is_deeply $d->[3], $s, 'not decoded'; +} + +done_testing; + diff --git a/perl/xs-src/MessagePack.c b/perl/xs-src/MessagePack.c index 69337f41..0c3c0b16 100644 --- a/perl/xs-src/MessagePack.c +++ b/perl/xs-src/MessagePack.c @@ -7,6 +7,8 @@ XS(xs_pack); XS(xs_unpack); XS(xs_unpacker_new); +XS(xs_unpacker_utf8); +XS(xs_unpacker_get_utf8); XS(xs_unpacker_execute); XS(xs_unpacker_execute_limit); XS(xs_unpacker_is_finished); @@ -28,6 +30,8 @@ XS(boot_Data__MessagePack) { newXS("Data::MessagePack::unpack", xs_unpack, __FILE__); newXS("Data::MessagePack::Unpacker::new", xs_unpacker_new, __FILE__); + newXS("Data::MessagePack::Unpacker::utf8", xs_unpacker_utf8, __FILE__); + newXS("Data::MessagePack::Unpacker::get_utf8", xs_unpacker_get_utf8, __FILE__); newXS("Data::MessagePack::Unpacker::execute", xs_unpacker_execute, __FILE__); newXS("Data::MessagePack::Unpacker::execute_limit", xs_unpacker_execute_limit, __FILE__); newXS("Data::MessagePack::Unpacker::is_finished", xs_unpacker_is_finished, __FILE__); diff --git a/perl/xs-src/unpack.c b/perl/xs-src/unpack.c index 065573ab..caf86623 100644 --- a/perl/xs-src/unpack.c +++ b/perl/xs-src/unpack.c @@ -13,6 
+13,7 @@ START_MY_CXT typedef struct { bool finished; bool incremented; + bool utf8; } unpack_user; #include "msgpack/unpack_define.h" @@ -237,6 +238,9 @@ STATIC_INLINE int template_callback_raw(unpack_user* u PERL_UNUSED_DECL, const c dTHX; /* newSVpvn(p, l) returns an undef if p == NULL */ *o = ((l==0) ? newSVpvs("") : newSVpvn(p, l)); + if(u->utf8) { + sv_utf8_decode(*o); + } return 0; } @@ -276,7 +280,7 @@ XS(xs_unpack) { msgpack_unpack_t mp; template_init(&mp); - unpack_user const u = {false, false}; + unpack_user const u = {false, false, false}; mp.user = u; size_t from = 0; @@ -303,7 +307,7 @@ XS(xs_unpack) { STATIC_INLINE void _reset(SV* const self) { dTHX; - unpack_user const u = {false, false}; + unpack_user const u = {false, false, false}; UNPACKER(self, mp); template_init(mp); @@ -328,6 +332,26 @@ XS(xs_unpacker_new) { XSRETURN(1); } +XS(xs_unpacker_utf8) { /* $unpacker->utf8([$bool]): sets the utf8 flag of the unpacker */ + dXSARGS; + if (!(items == 1 || items == 2)) { + Perl_croak(aTHX_ "Usage: $unpacker->utf8([$bool])"); + } + UNPACKER(ST(0), mp); + mp->user.utf8 = (items == 1 || sv_true(ST(1))) ? 
true : false; /* calling without $bool turns utf8 mode on */ + XSRETURN(1); // returns $self +} + +XS(xs_unpacker_get_utf8) { + dXSARGS; + if (items != 1) { + Perl_croak(aTHX_ "Usage: $unpacker->get_utf8()"); + } + UNPACKER(ST(0), mp); + ST(0) = boolSV(mp->user.utf8); + XSRETURN(1); +} + STATIC_INLINE size_t _execute_impl(SV* const self, SV* const data, UV const offset, UV const limit) { dTHX; @@ -419,10 +443,12 @@ XS(xs_unpacker_reset) { } UNPACKER(ST(0), mp); + bool const utf8 = mp->user.utf8; // save SV* const data = template_data(mp); SvREFCNT_dec(data); _reset(ST(0)); + mp->user.utf8 = utf8; XSRETURN(0); } diff --git a/python/msgpack/_msgpack.pyx b/python/msgpack/_msgpack.pyx index c887127b..66869c80 100644 --- a/python/msgpack/_msgpack.pyx +++ b/python/msgpack/_msgpack.pyx @@ -7,6 +7,7 @@ cdef extern from "Python.h": cdef object PyBytes_FromStringAndSize(const_char_ptr b, Py_ssize_t len) cdef PyObject* Py_True cdef PyObject* Py_False + cdef object PyUnicode_AsUTF8String(object) cdef long long PyLong_AsLongLong(object o) cdef unsigned long long PyLong_AsUnsignedLongLong(object o) @@ -105,7 +106,7 @@ cdef class Packer(object): if ret == 0: ret = msgpack_pack_raw_body(&self.pk, rawval, len(o)) elif PyUnicode_Check(o): - o = o.encode('utf-8') + o = PyUnicode_AsUTF8String(o) rawval = o ret = msgpack_pack_raw(&self.pk, len(o)) if ret == 0: @@ -169,7 +170,7 @@ cdef extern from "unpack.h": object template_data(template_context* ctx) -def unpackb(object packed_bytes): +def unpackb(bytes packed_bytes): """Unpack packed_bytes to object. 
Returns an unpacked object.""" cdef const_char_ptr p = packed_bytes cdef template_context ctx @@ -232,7 +233,7 @@ cdef class Unpacker(object): cdef object file_like cdef int read_size cdef object waiting_bytes - cdef int use_list + cdef bint use_list def __cinit__(self): self.buf = NULL @@ -241,7 +242,7 @@ cdef class Unpacker(object): if self.buf: free(self.buf); - def __init__(self, file_like=None, int read_size=0, use_list=0): + def __init__(self, file_like=None, int read_size=0, bint use_list=0): if read_size == 0: read_size = 1024*1024 self.use_list = use_list diff --git a/python/test/test_case.py b/python/test/test_case.py index a08c6ce1..1cbc4945 100644 --- a/python/test/test_case.py +++ b/python/test/test_case.py @@ -98,5 +98,8 @@ def test_match(): for v, p in cases: match(v, p) +def test_unicode(): + assert_equal('foobar', unpacks(packs(u'foobar'))) + if __name__ == '__main__': main() diff --git a/python/test3/test_case.py b/python/test3/test_case.py index 53dfcaf0..2f423161 100644 --- a/python/test3/test_case.py +++ b/python/test3/test_case.py @@ -98,5 +98,8 @@ def test_match(): for v, p in cases: match(v, p) +def test_unicode(): + assert_equal(b'foobar', unpacks(packs('foobar'))) + if __name__ == '__main__': main()