From b5c78de2ddf82783a6f80a199b68927d1a1747ca Mon Sep 17 00:00:00 2001 From: frsyuki Date: Tue, 31 Aug 2010 06:30:16 +0900 Subject: [PATCH] ruby: converts encodings into UTF-8 on Ruby 1.9 --- ruby/encoding.h | 33 ++++++++++++++++++ ruby/pack.c | 21 ++++++++++-- ruby/rbinit.c | 15 +++++++++ ruby/test/test_encoding.rb | 68 ++++++++++++++++++++++++++++++++++++++ ruby/test/test_helper.rb | 4 ++- ruby/unpack.c | 38 +++------------------ 6 files changed, 141 insertions(+), 38 deletions(-) create mode 100644 ruby/encoding.h create mode 100644 ruby/test/test_encoding.rb diff --git a/ruby/encoding.h b/ruby/encoding.h new file mode 100644 index 00000000..2ad3fd7f --- /dev/null +++ b/ruby/encoding.h @@ -0,0 +1,33 @@ +/* + * MessagePack for Ruby + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef ENCODING_H__ +#define ENCODING_H__ + + +#ifdef HAVE_RUBY_ENCODING_H +#include "ruby/encoding.h" +#define MSGPACK_RUBY_ENCODING +extern int s_enc_utf8; +extern int s_enc_ascii8bit; +extern int s_enc_usascii; +extern VALUE s_enc_utf8_value; +#endif + + +#endif /* encoding.h */ + diff --git a/ruby/pack.c b/ruby/pack.c index bbeac4a5..35878c7d 100644 --- a/ruby/pack.c +++ b/ruby/pack.c @@ -16,6 +16,8 @@ * limitations under the License. */ #include "ruby.h" +#include "encoding.h" + #include "msgpack/pack_define.h" static ID s_to_msgpack; @@ -131,7 +133,6 @@ static VALUE MessagePack_Fixnum_to_msgpack(int argc, VALUE *argv, VALUE self) static VALUE MessagePack_Bignum_to_msgpack(int argc, VALUE *argv, VALUE self) { ARG_BUFFER(out, argc, argv); - // FIXME bignum if(RBIGNUM_SIGN(self)) { // positive msgpack_pack_uint64(out, rb_big2ull(self)); } else { // negative @@ -168,6 +169,14 @@ static VALUE MessagePack_Float_to_msgpack(int argc, VALUE *argv, VALUE self) static VALUE MessagePack_String_to_msgpack(int argc, VALUE *argv, VALUE self) { ARG_BUFFER(out, argc, argv); +#ifdef MSGPACK_RUBY_ENCODING + int enc = ENCODING_GET(self); + if(enc != s_enc_utf8 && enc != s_enc_ascii8bit && enc != s_enc_usascii) { + if(!ENC_CODERANGE_ASCIIONLY(self)) { + self = rb_str_encode(self, s_enc_utf8_value, 0, Qnil); + } + } +#endif msgpack_pack_raw(out, RSTRING_LEN(self)); msgpack_pack_raw_body(out, RSTRING_PTR(self), RSTRING_LEN(self)); return out; @@ -184,12 +193,16 @@ static VALUE MessagePack_String_to_msgpack(int argc, VALUE *argv, VALUE self) */ static VALUE MessagePack_Symbol_to_msgpack(int argc, VALUE *argv, VALUE self) { +#ifdef MSGPACK_RUBY_ENCODING + return MessagePack_String_to_msgpack(argc, argv, rb_id2str(SYM2ID(self))); +#else ARG_BUFFER(out, argc, argv); const char* name = rb_id2name(SYM2ID(self)); size_t len = strlen(name); msgpack_pack_raw(out, len); msgpack_pack_raw_body(out, name, len); return out; +#endif } @@ -205,7 +218,8 @@ static VALUE MessagePack_Symbol_to_msgpack(int argc, VALUE *argv, VALUE self) static VALUE MessagePack_Array_to_msgpack(int argc, VALUE *argv, VALUE self) { ARG_BUFFER(out, argc, argv); - msgpack_pack_array(out, RARRAY_LEN(self)); + // FIXME check sizeof(long) > sizeof(unsigned int) && RARRAY_LEN(self) > UINT_MAX + msgpack_pack_array(out, (unsigned int)RARRAY_LEN(self)); VALUE* p = RARRAY_PTR(self); VALUE* const pend = p + RARRAY_LEN(self); for(;p != pend; ++p) { @@ -239,7 +253,8 @@ static int MessagePack_Hash_to_msgpack_foreach(VALUE key, VALUE value, VALUE out static VALUE MessagePack_Hash_to_msgpack(int argc, VALUE *argv, VALUE self) { ARG_BUFFER(out, argc, argv); - msgpack_pack_map(out, RHASH_SIZE(self)); + // FIXME check sizeof(st_index_t) > sizeof(unsigned int) && RARRAY_LEN(self) > UINT_MAX + msgpack_pack_map(out, (unsigned int)RHASH_SIZE(self)); rb_hash_foreach(self, MessagePack_Hash_to_msgpack_foreach, out); return out; } diff --git a/ruby/rbinit.c b/ruby/rbinit.c index 28a8bfec..46781594 100644 --- a/ruby/rbinit.c +++ b/ruby/rbinit.c @@ -17,9 +17,17 @@ */ #include "pack.h" #include "unpack.h" +#include "encoding.h" static VALUE mMessagePack; +#ifdef MSGPACK_RUBY_ENCODING +int s_enc_utf8; +int s_enc_ascii8bit; +int s_enc_usascii; +VALUE s_enc_utf8_value; +#endif + /** * Document-module: MessagePack * @@ -46,6 +54,13 @@ void Init_msgpack(void) rb_define_const(mMessagePack, "VERSION", rb_str_new2(MESSAGEPACK_VERSION)); +#ifdef MSGPACK_RUBY_ENCODING + s_enc_ascii8bit = rb_ascii8bit_encindex(); + s_enc_utf8 = rb_utf8_encindex(); + s_enc_usascii = rb_usascii_encindex(); + s_enc_utf8_value = rb_enc_from_encoding(rb_utf8_encoding()); +#endif + Init_msgpack_unpack(mMessagePack); Init_msgpack_pack(mMessagePack); } diff --git a/ruby/test/test_encoding.rb b/ruby/test/test_encoding.rb new file mode 100644 index 00000000..2cf0767c --- /dev/null +++ b/ruby/test/test_encoding.rb @@ -0,0 +1,68 @@ +#!/usr/bin/env ruby +require File.dirname(__FILE__)+'/test_helper' + +if RUBY_VERSION < "1.9" + exit +end + +class MessagePackTestEncoding < Test::Unit::TestCase + def self.it(name, &block) + define_method("test_#{name}", &block) + end + + it "US-ASCII" do + check_unpack "abc".force_encoding("US-ASCII") + end + + it "UTF-8 ascii" do + check_unpack "abc".force_encoding("UTF-8") + end + + it "UTF-8 mbstr" do + check_unpack "\xE3\x81\x82".force_encoding("UTF-8") + end + + it "UTF-8 invalid" do + check_unpack "\xD0".force_encoding("UTF-8") + end + + it "ASCII-8BIT" do + check_unpack "\xD0".force_encoding("ASCII-8BIT") + end + + it "EUC-JP" do + x = "\xA4\xA2".force_encoding("EUC-JP") + check_unpack(x) + end + + it "EUC-JP invalid" do + begin + "\xD0".force_encoding("EUC-JP").to_msgpack + assert(false) + rescue Encoding::InvalidByteSequenceError + assert(true) + end + end + + private + def check_unpack(str) + if str.encoding.to_s == "ASCII-8BIT" + should_str = str.dup.force_encoding("UTF-8") + else + should_str = str.encode("UTF-8") + end + + raw = str.to_msgpack + r = MessagePack.unpack(str.to_msgpack) + assert_equal(r.encoding.to_s, "UTF-8") + assert_equal(r, should_str.force_encoding("UTF-8")) + + if str.valid_encoding? + sym = str.to_sym + r = MessagePack.unpack(sym.to_msgpack) + assert_equal(r.encoding.to_s, "UTF-8") + assert_equal(r, should_str.force_encoding("UTF-8")) + end + end +end + diff --git a/ruby/test/test_helper.rb b/ruby/test/test_helper.rb index 80d7806a..4def861a 100644 --- a/ruby/test/test_helper.rb +++ b/ruby/test/test_helper.rb @@ -5,4 +5,6 @@ rescue LoadError require File.dirname(__FILE__) + '/../lib/msgpack' end -#GC.stress = true +if ENV["GC_STRESS"] + GC.stress = true +end diff --git a/ruby/unpack.c b/ruby/unpack.c index 09481510..3c5e3507 100644 --- a/ruby/unpack.c +++ b/ruby/unpack.c @@ -16,17 +16,13 @@ * limitations under the License. */ #include "ruby.h" +#include "encoding.h" #include "msgpack/unpack_define.h" static ID s_sysread; static ID s_readpartial; -#ifdef HAVE_RUBY_ENCODING_H -#include "ruby/encoding.h" -int s_ascii_8bit; -#endif - struct unpack_buffer { size_t size; size_t free; @@ -136,6 +132,9 @@ static inline int template_callback_raw(unpack_user* u, const char* b, const cha } else { *o = rb_str_substr(u->source, p - b, l); } +#ifdef MSGPACK_RUBY_ENCODING + ENCODING_SET(*o, s_enc_utf8); +#endif return 0; } @@ -163,16 +162,6 @@ static inline int template_callback_raw(unpack_user* u, const char* b, const cha #endif -#ifdef HAVE_RUBY_ENCODING_H -static VALUE template_execute_rescue_enc(VALUE data) -{ - rb_gc_enable(); - VALUE* resc = (VALUE*)data; - rb_enc_set_index(resc[0], (int)resc[1]); - RERAISE; -} -#endif - static VALUE template_execute_rescue(VALUE nouse) { rb_gc_enable(); @@ -203,31 +192,16 @@ static int template_execute_wrap(msgpack_unpack_t* mp, (VALUE)from, }; -#ifdef HAVE_RUBY_ENCODING_H - int enc_orig = rb_enc_get_index(str); - rb_enc_set_index(str, s_ascii_8bit); -#endif - // FIXME execute実行中はmp->topが更新されないのでGC markが機能しない rb_gc_disable(); mp->user.source = str; -#ifdef HAVE_RUBY_ENCODING_H - VALUE resc[2] = {str, enc_orig}; - int ret = (int)rb_rescue(template_execute_do, (VALUE)args, - template_execute_rescue_enc, (VALUE)resc); -#else int ret = (int)rb_rescue(template_execute_do, (VALUE)args, template_execute_rescue, Qnil); -#endif rb_gc_enable(); -#ifdef HAVE_RUBY_ENCODING_H - rb_enc_set_index(str, enc_orig); -#endif - return ret; } @@ -746,10 +720,6 @@ void Init_msgpack_unpack(VALUE mMessagePack) s_sysread = rb_intern("sysread"); s_readpartial = rb_intern("readpartial"); -#ifdef HAVE_RUBY_ENCODING_H - s_ascii_8bit = rb_enc_find_index("ASCII-8BIT"); -#endif - eUnpackError = rb_define_class_under(mMessagePack, "UnpackError", rb_eStandardError); cUnpacker = rb_define_class_under(mMessagePack, "Unpacker", rb_cObject); rb_define_alloc_func(cUnpacker, MessagePack_Unpacker_alloc);