From a4a04872a3e7d331c722938a719711cb3178a5c7 Mon Sep 17 00:00:00 2001 From: "Fuji, Goro" Date: Tue, 5 Oct 2010 17:10:10 +0900 Subject: [PATCH 1/9] perl: add $unpacker->utf8 mode, decoding strings as UTF-8. --- perl/lib/Data/MessagePack/PP.pm | 21 ++++++++++++++++--- perl/lib/Data/MessagePack/Unpacker.pod | 11 ++++++++++ perl/t/15_utf8.t | 27 +++++++++++++++++++++++++ perl/xs-src/MessagePack.c | 4 ++++ perl/xs-src/unpack.c | 28 ++++++++++++++++++++++++-- 5 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 perl/t/15_utf8.t diff --git a/perl/lib/Data/MessagePack/PP.pm b/perl/lib/Data/MessagePack/PP.pm index 5dccc0bb..00e58b92 100644 --- a/perl/lib/Data/MessagePack/PP.pm +++ b/perl/lib/Data/MessagePack/PP.pm @@ -248,6 +248,7 @@ sub _pack { # UNPACK # +our $_utf8 = 0; my $p; # position variables for speed. sub unpack :method { @@ -358,7 +359,9 @@ sub _unpack { $num = $byte & ~0xa0; $p += $num; } - return substr( $value, $p - $num, $num ); + my $s = substr( $value, $p - $num, $num ); + utf8::decode($s) if $_utf8; + return $s; } elsif ( $byte == 0xc0 ) { # nil @@ -396,9 +399,19 @@ package Data::MessagePack::PP::Unpacker; sub new { - bless { pos => 0 }, shift; + bless { pos => 0, utf8 => 0 }, shift; } +sub utf8 { + my $self = shift; + $self->{utf8} = (@_ ? shift : 1); + return $self; +} + +sub get_utf8 { + my($self) = @_; + return $self->{utf8}; +} sub execute_limit { execute( @_ ); @@ -540,7 +553,9 @@ sub _count { sub data { - return Data::MessagePack->unpack( substr($_[0]->{ data }, 0, $_[0]->{pos}) ); + my($self) = @_; + local $Data::MessagePack::PP::_utf8 = $self->{utf8}; + return Data::MessagePack->unpack( substr($self->{ data }, 0, $self->{pos}) ); } diff --git a/perl/lib/Data/MessagePack/Unpacker.pod b/perl/lib/Data/MessagePack/Unpacker.pod index 2bc4549c..37ab3db8 100644 --- a/perl/lib/Data/MessagePack/Unpacker.pod +++ b/perl/lib/Data/MessagePack/Unpacker.pod @@ -24,6 +24,17 @@ This is a streaming deserializer for messagepack. creates a new instance of stream deserializer. +=item $up->utf8([$bool]) + +sets utf8 mode. true if I<$bool> is omitted. +returns I<$up> itself. + +If utf8 mode is enabled, strings will be decoded as UTF-8. + +=item my $ret = $up->get_utf8() + +returns the utf8 mode flag of I<$up>. + =item my $ret = $up->execute($data, $offset); =item my $ret = $up->execute_limit($data, $offset, $limit) diff --git a/perl/t/15_utf8.t b/perl/t/15_utf8.t new file mode 100644 index 00000000..d7d17b88 --- /dev/null +++ b/perl/t/15_utf8.t @@ -0,0 +1,27 @@ +#!perl -w +use strict; +use Test::More; +use Data::MessagePack; +use utf8; + +my $data = [42, undef, 'foo', "\x{99f1}\x{99dd}"]; +my $packed = Data::MessagePack->pack($data); + +my $u = Data::MessagePack::Unpacker->new()->utf8(); +ok $u->get_utf8(); +$u->execute($packed); +my $d = $u->data(); +$u->reset(); +is_deeply $d, $data, 'decoded'; + +is $u->utf8(0), $u, 'utf8(0)'; +ok !$u->get_utf8(); +$u->execute($packed); +$d = $u->data(); +$u->reset(); +my $s = $data->[3]; +utf8::encode($s); +is_deeply $d->[3], $s, 'not decoded'; + +done_testing; + diff --git a/perl/xs-src/MessagePack.c b/perl/xs-src/MessagePack.c index 69337f41..0c3c0b16 100644 --- a/perl/xs-src/MessagePack.c +++ b/perl/xs-src/MessagePack.c @@ -7,6 +7,8 @@ XS(xs_pack); XS(xs_unpack); XS(xs_unpacker_new); +XS(xs_unpacker_utf8); +XS(xs_unpacker_get_utf8); XS(xs_unpacker_execute); XS(xs_unpacker_execute_limit); XS(xs_unpacker_is_finished); @@ -28,6 +30,8 @@ XS(boot_Data__MessagePack) { newXS("Data::MessagePack::unpack", xs_unpack, __FILE__); newXS("Data::MessagePack::Unpacker::new", xs_unpacker_new, __FILE__); + newXS("Data::MessagePack::Unpacker::utf8", xs_unpacker_utf8, __FILE__); + newXS("Data::MessagePack::Unpacker::get_utf8", xs_unpacker_get_utf8, __FILE__); newXS("Data::MessagePack::Unpacker::execute", xs_unpacker_execute, __FILE__); newXS("Data::MessagePack::Unpacker::execute_limit", xs_unpacker_execute_limit, __FILE__); newXS("Data::MessagePack::Unpacker::is_finished", xs_unpacker_is_finished, __FILE__); diff --git a/perl/xs-src/unpack.c b/perl/xs-src/unpack.c index 065573ab..f39d8c11 100644 --- a/perl/xs-src/unpack.c +++ b/perl/xs-src/unpack.c @@ -13,6 +13,7 @@ START_MY_CXT typedef struct { bool finished; bool incremented; + bool utf8; } unpack_user; #include "msgpack/unpack_define.h" @@ -237,6 +238,9 @@ STATIC_INLINE int template_callback_raw(unpack_user* u PERL_UNUSED_DECL, const c dTHX; /* newSVpvn(p, l) returns an undef if p == NULL */ *o = ((l==0) ? newSVpvs("") : newSVpvn(p, l)); + if(u->utf8) { + sv_utf8_decode(*o); + } return 0; } @@ -276,7 +280,7 @@ XS(xs_unpack) { msgpack_unpack_t mp; template_init(&mp); - unpack_user const u = {false, false}; + unpack_user const u = {false, false, false}; mp.user = u; size_t from = 0; @@ -303,7 +307,7 @@ XS(xs_unpack) { STATIC_INLINE void _reset(SV* const self) { dTHX; - unpack_user const u = {false, false}; + unpack_user const u = {false, false, false}; UNPACKER(self, mp); template_init(mp); @@ -328,6 +332,26 @@ XS(xs_unpacker_new) { XSRETURN(1); } +XS(xs_unpacker_utf8) { + dXSARGS; + if (!(items == 1 || items == 2)) { + Perl_croak(aTHX_ "Usage: $unpacker->utf8([$bool)"); + } + UNPACKER(ST(0), mp); + mp->user.utf8 = (items == 1 || sv_true(ST(1))) ? true : false; + XSRETURN(1); // returns $self +} + +XS(xs_unpacker_get_utf8) { + dXSARGS; + if (items != 1) { + Perl_croak(aTHX_ "Usage: $unpacker->get_utf8()"); + } + UNPACKER(ST(0), mp); + ST(0) = boolSV(mp->user.utf8); + XSRETURN(1); +} + STATIC_INLINE size_t _execute_impl(SV* const self, SV* const data, UV const offset, UV const limit) { dTHX; From 7c92f8a90b54a7cf2279211fc4601e11872bee38 Mon Sep 17 00:00:00 2001 From: "Fuji, Goro" Date: Tue, 5 Oct 2010 17:47:27 +0900 Subject: [PATCH 2/9] perl: improve docs --- perl/lib/Data/MessagePack.pm | 6 ++++++ perl/lib/Data/MessagePack/Unpacker.pod | 2 ++ 2 files changed, 8 insertions(+) diff --git a/perl/lib/Data/MessagePack.pm b/perl/lib/Data/MessagePack.pm index 7d1bda78..3d2d6954 100644 --- a/perl/lib/Data/MessagePack.pm +++ b/perl/lib/Data/MessagePack.pm @@ -165,6 +165,12 @@ will astonish those who try to unpack byte streams with an arbitrary buffer size (e.g. C<< while(read($socket, $buffer, $arbitrary_buffer_size)) { ... } >>). We should implement the internal buffer for the unpacker. +=item UTF8 mode + +Data::MessagePack::Unpacker supports utf8 mode, which decodes strings +as UTF8-8. << Data::MessagePack->unpack >> should support utf8 mode in a +future. + =back =head1 AUTHORS diff --git a/perl/lib/Data/MessagePack/Unpacker.pod b/perl/lib/Data/MessagePack/Unpacker.pod index 37ab3db8..24dafd00 100644 --- a/perl/lib/Data/MessagePack/Unpacker.pod +++ b/perl/lib/Data/MessagePack/Unpacker.pod @@ -31,6 +31,8 @@ returns I<$up> itself. If utf8 mode is enabled, strings will be decoded as UTF-8. +The utf8 mode is disabled by default. + =item my $ret = $up->get_utf8() returns the utf8 mode flag of I<$up>. From 4b93b0632379a30ac0d8c5debd26f7ad28d64faa Mon Sep 17 00:00:00 2001 From: tokuhirom Date: Wed, 6 Oct 2010 12:27:04 +0900 Subject: [PATCH 3/9] perl: regenerated README file --- perl/README | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/perl/README b/perl/README index 224ff08c..ae924ee6 100644 --- a/perl/README +++ b/perl/README @@ -102,6 +102,11 @@ TODO "while(read($socket, $buffer, $arbitrary_buffer_size)) { ... }"). We should implement the internal buffer for the unpacker. + UTF8 mode + Data::MessagePack::Unpacker supports utf8 mode, which decodes + strings as UTF8-8. << Data::MessagePack->unpack >> should support + utf8 mode in a future. + AUTHORS Tokuhiro Matsuno From 4321b80999696a6d4e5340fec8f729e11c0ada06 Mon Sep 17 00:00:00 2001 From: tokuhirom Date: Wed, 6 Oct 2010 12:29:31 +0900 Subject: [PATCH 4/9] Checking in changes prior to tagging of version 0.29. Changelog diff is: diff --git a/perl/Changes b/perl/Changes index b506234..333a824 100644 --- a/perl/Changes +++ b/perl/Changes @@ -1,3 +1,7 @@ +0.29 + + - add $unpacker->utf8 mode, decoding strings as UTF-8. + 0.28 - added more tests(gfx) --- perl/Changes | 4 ++++ perl/lib/Data/MessagePack.pm | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/perl/Changes b/perl/Changes index b506234b..333a824e 100644 --- a/perl/Changes +++ b/perl/Changes @@ -1,3 +1,7 @@ +0.29 + + - add $unpacker->utf8 mode, decoding strings as UTF-8. + 0.28 - added more tests(gfx) diff --git a/perl/lib/Data/MessagePack.pm b/perl/lib/Data/MessagePack.pm index 3d2d6954..01e0fc05 100644 --- a/perl/lib/Data/MessagePack.pm +++ b/perl/lib/Data/MessagePack.pm @@ -3,7 +3,7 @@ use strict; use warnings; use 5.008001; -our $VERSION = '0.28'; +our $VERSION = '0.29'; our $PreferInteger = 0; sub true () { From 77a7d3d26aaf957c4f231fdc2090d1b8e842e78c Mon Sep 17 00:00:00 2001 From: "Fuji, Goro" Date: Wed, 6 Oct 2010 17:52:32 +0900 Subject: [PATCH 5/9] perl: Fix utf8 mode not to be reseted by $unpacker->reset method --- perl/Changes | 5 +++++ perl/t/15_utf8.t | 32 +++++++++++++++++++------------- perl/xs-src/unpack.c | 2 ++ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/perl/Changes b/perl/Changes index 333a824e..091c875a 100644 --- a/perl/Changes +++ b/perl/Changes @@ -1,3 +1,8 @@ + +0.30 + + - fix utf8 mode not to be reseted by $unpacker->reset method + 0.29 - add $unpacker->utf8 mode, decoding strings as UTF-8. diff --git a/perl/t/15_utf8.t b/perl/t/15_utf8.t index d7d17b88..f3163dfa 100644 --- a/perl/t/15_utf8.t +++ b/perl/t/15_utf8.t @@ -5,23 +5,29 @@ use Data::MessagePack; use utf8; my $data = [42, undef, 'foo', "\x{99f1}\x{99dd}"]; -my $packed = Data::MessagePack->pack($data); +my $packed = Data::MessagePack->pack($data) x 2; my $u = Data::MessagePack::Unpacker->new()->utf8(); -ok $u->get_utf8(); -$u->execute($packed); -my $d = $u->data(); -$u->reset(); -is_deeply $d, $data, 'decoded'; +my $p = 0; +for(1 .. 2) { + ok $u->get_utf8(); + $p = $u->execute($packed, $p); + my $d = $u->data(); + $u->reset(); + is_deeply $d, $data, 'decoded'; +} is $u->utf8(0), $u, 'utf8(0)'; -ok !$u->get_utf8(); -$u->execute($packed); -$d = $u->data(); -$u->reset(); -my $s = $data->[3]; -utf8::encode($s); -is_deeply $d->[3], $s, 'not decoded'; +$p = 0; +for(1 .. 2) { + ok !$u->get_utf8(); + $p = $u->execute($packed, $p); + my $d = $u->data(); + $u->reset(); + my $s = $data->[3]; + utf8::encode($s); + is_deeply $d->[3], $s, 'not decoded'; +} done_testing; diff --git a/perl/xs-src/unpack.c b/perl/xs-src/unpack.c index f39d8c11..caf86623 100644 --- a/perl/xs-src/unpack.c +++ b/perl/xs-src/unpack.c @@ -443,10 +443,12 @@ XS(xs_unpacker_reset) { } UNPACKER(ST(0), mp); + bool const utf8 = mp->user.utf8; // save SV* const data = template_data(mp); SvREFCNT_dec(data); _reset(ST(0)); + mp->user.utf8 = utf8; XSRETURN(0); } From 90da951a6f385b9860df1bdf4504e534e2a00397 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 7 Oct 2010 03:02:02 +0900 Subject: [PATCH 6/9] Use PyUnicode_AsUTF8String() instead of o.encode('utf-8'). --- python/msgpack/_msgpack.pyx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/msgpack/_msgpack.pyx b/python/msgpack/_msgpack.pyx index c887127b..66869c80 100644 --- a/python/msgpack/_msgpack.pyx +++ b/python/msgpack/_msgpack.pyx @@ -7,6 +7,7 @@ cdef extern from "Python.h": cdef object PyBytes_FromStringAndSize(const_char_ptr b, Py_ssize_t len) cdef PyObject* Py_True cdef PyObject* Py_False + cdef object PyUnicode_AsUTF8String(object) cdef long long PyLong_AsLongLong(object o) cdef unsigned long long PyLong_AsUnsignedLongLong(object o) @@ -105,7 +106,7 @@ cdef class Packer(object): if ret == 0: ret = msgpack_pack_raw_body(&self.pk, rawval, len(o)) elif PyUnicode_Check(o): - o = o.encode('utf-8') + o = PyUnicode_AsUTF8String(o) rawval = o ret = msgpack_pack_raw(&self.pk, len(o)) if ret == 0: @@ -169,7 +170,7 @@ cdef extern from "unpack.h": object template_data(template_context* ctx) -def unpackb(object packed_bytes): +def unpackb(bytes packed_bytes): """Unpack packed_bytes to object. Returns an unpacked object.""" cdef const_char_ptr p = packed_bytes cdef template_context ctx @@ -232,7 +233,7 @@ cdef class Unpacker(object): cdef object file_like cdef int read_size cdef object waiting_bytes - cdef int use_list + cdef bint use_list def __cinit__(self): self.buf = NULL @@ -241,7 +242,7 @@ cdef class Unpacker(object): if self.buf: free(self.buf); - def __init__(self, file_like=None, int read_size=0, use_list=0): + def __init__(self, file_like=None, int read_size=0, bint use_list=0): if read_size == 0: read_size = 1024*1024 self.use_list = use_list From 84673072396650520e24b5a770c11aa777f69557 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 7 Oct 2010 03:04:00 +0900 Subject: [PATCH 7/9] Fix testcase for unicode. --- python/test3/test_case.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/test3/test_case.py b/python/test3/test_case.py index 53dfcaf0..2f423161 100644 --- a/python/test3/test_case.py +++ b/python/test3/test_case.py @@ -98,5 +98,8 @@ def test_match(): for v, p in cases: match(v, p) +def test_unicode(): + assert_equal(b'foobar', unpacks(packs('foobar'))) + if __name__ == '__main__': main() From aa2a3e5b075f60c9c5143528ab6520c6231aec0a Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 7 Oct 2010 03:07:52 +0900 Subject: [PATCH 8/9] Add unicode testcase for Python2. --- python/test/test_case.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/test/test_case.py b/python/test/test_case.py index a08c6ce1..1cbc4945 100644 --- a/python/test/test_case.py +++ b/python/test/test_case.py @@ -98,5 +98,8 @@ def test_match(): for v, p in cases: match(v, p) +def test_unicode(): + assert_equal('foobar', unpacks(packs(u'foobar'))) + if __name__ == '__main__': main() From f3ee5ab37266f61efb029365cf8689191ed2f756 Mon Sep 17 00:00:00 2001 From: frsyuki Date: Thu, 7 Oct 2010 06:28:12 +0900 Subject: [PATCH 9/9] cpp: fixes ./bootstrap to surely run ./preprocess --- cpp/bootstrap | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/bootstrap b/cpp/bootstrap index 7f3a182a..1ff6b76e 100755 --- a/cpp/bootstrap +++ b/cpp/bootstrap @@ -38,7 +38,8 @@ test -f ChangeLog || touch ChangeLog test -f NEWS || touch NEWS test -f README || cp -f README.md README -if test ! ./preprocess; then +./preprocess +if [ $? -ne 0 ]; then exit 1 fi