ruby: converts encodings into UTF-8 on Ruby 1.9

This commit is contained in:
frsyuki 2010-08-31 06:30:16 +09:00
parent a1bd14e516
commit b5c78de2dd
6 changed files with 141 additions and 38 deletions

33
ruby/encoding.h Normal file
View File

@ -0,0 +1,33 @@
/*
* MessagePack for Ruby
*
* Copyright (C) 2008-2010 FURUHASHI Sadayuki
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * Shared encoding declarations for the extension's C sources.
 *
 * Everything below is compiled only when HAVE_RUBY_ENCODING_H is defined
 * (presumably provided by Ruby's headers / build configuration on
 * Ruby 1.9+ -- TODO confirm where it comes from in this build).
 */
#ifndef ENCODING_H__
#define ENCODING_H__
#ifdef HAVE_RUBY_ENCODING_H
#include "ruby/encoding.h"
/* Feature macro: the other sources test this to enable encoding handling. */
#define MSGPACK_RUBY_ENCODING
/* Declared here only; the definitions live in a single translation unit. */
extern int s_enc_utf8;          /* encoding index of UTF-8 */
extern int s_enc_ascii8bit;     /* encoding index of ASCII-8BIT */
extern int s_enc_usascii;       /* encoding index of US-ASCII */
extern VALUE s_enc_utf8_value;  /* UTF-8 Encoding object, passed to rb_str_encode() */
#endif
#endif /* encoding.h */

View File

@ -16,6 +16,8 @@
* limitations under the License.
*/
#include "ruby.h"
#include "encoding.h"
#include "msgpack/pack_define.h"
static ID s_to_msgpack;
@ -131,7 +133,6 @@ static VALUE MessagePack_Fixnum_to_msgpack(int argc, VALUE *argv, VALUE self)
static VALUE MessagePack_Bignum_to_msgpack(int argc, VALUE *argv, VALUE self)
{
ARG_BUFFER(out, argc, argv);
// FIXME bignum
if(RBIGNUM_SIGN(self)) { // positive
msgpack_pack_uint64(out, rb_big2ull(self));
} else { // negative
@ -168,6 +169,14 @@ static VALUE MessagePack_Float_to_msgpack(int argc, VALUE *argv, VALUE self)
static VALUE MessagePack_String_to_msgpack(int argc, VALUE *argv, VALUE self)
{
ARG_BUFFER(out, argc, argv);
#ifdef MSGPACK_RUBY_ENCODING
int enc = ENCODING_GET(self);
if(enc != s_enc_utf8 && enc != s_enc_ascii8bit && enc != s_enc_usascii) {
if(!ENC_CODERANGE_ASCIIONLY(self)) {
self = rb_str_encode(self, s_enc_utf8_value, 0, Qnil);
}
}
#endif
msgpack_pack_raw(out, RSTRING_LEN(self));
msgpack_pack_raw_body(out, RSTRING_PTR(self), RSTRING_LEN(self));
return out;
@ -184,12 +193,16 @@ static VALUE MessagePack_String_to_msgpack(int argc, VALUE *argv, VALUE self)
*/
static VALUE MessagePack_Symbol_to_msgpack(int argc, VALUE *argv, VALUE self)
{
    /*
     * Packs the Symbol `self` as a raw byte sequence holding its name.
     * argc/argv are forwarded untouched, either to the String packer or to
     * ARG_BUFFER (defined elsewhere), which supplies the output buffer `out`.
     */
#ifdef MSGPACK_RUBY_ENCODING
    /* Encoding-aware build: turn the symbol into a String and reuse the
     * String packer, so symbols get the same encoding handling as strings. */
    VALUE sym_str = rb_id2str(SYM2ID(self));
    return MessagePack_String_to_msgpack(argc, argv, sym_str);
#else
    ARG_BUFFER(out, argc, argv);
    /* No encoding support: write the NUL-terminated symbol name verbatim. */
    const char* sym_name = rb_id2name(SYM2ID(self));
    size_t sym_len = strlen(sym_name);
    msgpack_pack_raw(out, sym_len);
    msgpack_pack_raw_body(out, sym_name, sym_len);
    return out;
#endif
}
@ -205,7 +218,8 @@ static VALUE MessagePack_Symbol_to_msgpack(int argc, VALUE *argv, VALUE self)
static VALUE MessagePack_Array_to_msgpack(int argc, VALUE *argv, VALUE self)
{
ARG_BUFFER(out, argc, argv);
msgpack_pack_array(out, RARRAY_LEN(self));
// FIXME check sizeof(long) > sizeof(unsigned int) && RARRAY_LEN(self) > UINT_MAX
msgpack_pack_array(out, (unsigned int)RARRAY_LEN(self));
VALUE* p = RARRAY_PTR(self);
VALUE* const pend = p + RARRAY_LEN(self);
for(;p != pend; ++p) {
@ -239,7 +253,8 @@ static int MessagePack_Hash_to_msgpack_foreach(VALUE key, VALUE value, VALUE out
static VALUE MessagePack_Hash_to_msgpack(int argc, VALUE *argv, VALUE self)
{
    /*
     * Packs the Hash `self` as a MessagePack map: a single map header carrying
     * the entry count, then every key/value pair is handed to
     * MessagePack_Hash_to_msgpack_foreach through rb_hash_foreach().
     * Returns `out`, the output buffer supplied by ARG_BUFFER (defined
     * elsewhere) from argc/argv.
     */
    ARG_BUFFER(out, argc, argv);
    // The map header must be written exactly once; a second
    // msgpack_pack_map() call would put a stray header in front of the entries.
    // FIXME check sizeof(st_index_t) > sizeof(unsigned int) && RHASH_SIZE(self) > UINT_MAX
    msgpack_pack_map(out, (unsigned int)RHASH_SIZE(self));
    rb_hash_foreach(self, MessagePack_Hash_to_msgpack_foreach, out);
    return out;
}

View File

@ -17,9 +17,17 @@
*/
#include "pack.h"
#include "unpack.h"
#include "encoding.h"
static VALUE mMessagePack;
#ifdef MSGPACK_RUBY_ENCODING
/*
 * Definitions of the encoding globals that encoding.h declares extern.
 * They are filled in by Init_msgpack via rb_utf8_encindex(),
 * rb_ascii8bit_encindex(), rb_usascii_encindex() and
 * rb_enc_from_encoding(rb_utf8_encoding()).
 */
int s_enc_utf8;          /* encoding index of UTF-8 */
int s_enc_ascii8bit;     /* encoding index of ASCII-8BIT */
int s_enc_usascii;       /* encoding index of US-ASCII */
VALUE s_enc_utf8_value;  /* UTF-8 Encoding object */
#endif
/**
* Document-module: MessagePack
*
@ -46,6 +54,13 @@ void Init_msgpack(void)
rb_define_const(mMessagePack, "VERSION", rb_str_new2(MESSAGEPACK_VERSION));
#ifdef MSGPACK_RUBY_ENCODING
s_enc_ascii8bit = rb_ascii8bit_encindex();
s_enc_utf8 = rb_utf8_encindex();
s_enc_usascii = rb_usascii_encindex();
s_enc_utf8_value = rb_enc_from_encoding(rb_utf8_encoding());
#endif
Init_msgpack_unpack(mMessagePack);
Init_msgpack_pack(mMessagePack);
}

View File

@ -0,0 +1,68 @@
#!/usr/bin/env ruby
require File.dirname(__FILE__)+'/test_helper'
if RUBY_VERSION < "1.9"
exit
end
# Round-trip tests for how strings and symbols in various encodings are
# packed, and for the encoding of the strings the unpacker returns.
class MessagePackTestEncoding < Test::Unit::TestCase
  # Defines a test method named "test_<name>" whose body is the given block.
  def self.it(name, &block)
    define_method("test_#{name}", &block)
  end

  it "US-ASCII" do
    check_unpack "abc".force_encoding("US-ASCII")
  end

  it "UTF-8 ascii" do
    check_unpack "abc".force_encoding("UTF-8")
  end

  it "UTF-8 mbstr" do
    check_unpack "\xE3\x81\x82".force_encoding("UTF-8")
  end

  it "UTF-8 invalid" do
    check_unpack "\xD0".force_encoding("UTF-8")
  end

  it "ASCII-8BIT" do
    check_unpack "\xD0".force_encoding("ASCII-8BIT")
  end

  it "EUC-JP" do
    x = "\xA4\xA2".force_encoding("EUC-JP")
    check_unpack(x)
  end

  it "EUC-JP invalid" do
    # Packing a string whose bytes are invalid in its declared (non-UTF-8)
    # encoding is expected to raise instead of emitting garbage.
    assert_raise(Encoding::InvalidByteSequenceError) do
      "\xD0".force_encoding("EUC-JP").to_msgpack
    end
  end

  private

  # Packs +str+ (and, when its bytes are valid, the equivalent Symbol),
  # unpacks the result, and asserts that the unpacked String is tagged UTF-8
  # and equals +str+ converted to UTF-8. ASCII-8BIT input is compared
  # byte-for-byte, i.e. merely relabelled as UTF-8 rather than transcoded.
  def check_unpack(str)
    expected =
      if str.encoding == Encoding::ASCII_8BIT
        str.dup.force_encoding("UTF-8")
      else
        str.encode("UTF-8")
      end

    unpacked = MessagePack.unpack(str.to_msgpack)
    assert_equal("UTF-8", unpacked.encoding.to_s)
    assert_equal(expected, unpacked)

    # Symbols cannot be created from invalid byte sequences, so the Symbol
    # round trip only applies to validly encoded input.
    return unless str.valid_encoding?

    unpacked = MessagePack.unpack(str.to_sym.to_msgpack)
    assert_equal("UTF-8", unpacked.encoding.to_s)
    assert_equal(expected, unpacked)
  end
end

View File

@ -5,4 +5,6 @@ rescue LoadError
require File.dirname(__FILE__) + '/../lib/msgpack'
end
#GC.stress = true
# Opt-in GC stress mode: enabled only when the GC_STRESS environment
# variable is set.
GC.stress = true if ENV["GC_STRESS"]

View File

@ -16,17 +16,13 @@
* limitations under the License.
*/
#include "ruby.h"
#include "encoding.h"
#include "msgpack/unpack_define.h"
/* Method IDs interned in Init_msgpack_unpack ("sysread" / "readpartial"). */
static ID s_sysread;
static ID s_readpartial;
#ifdef HAVE_RUBY_ENCODING_H
#include "ruby/encoding.h"
/* Encoding index of ASCII-8BIT, looked up with rb_enc_find_index() in
 * Init_msgpack_unpack. */
int s_ascii_8bit;
#endif
struct unpack_buffer {
size_t size;
size_t free;
@ -136,6 +132,9 @@ static inline int template_callback_raw(unpack_user* u, const char* b, const cha
} else {
*o = rb_str_substr(u->source, p - b, l);
}
#ifdef MSGPACK_RUBY_ENCODING
ENCODING_SET(*o, s_enc_utf8);
#endif
return 0;
}
@ -163,16 +162,6 @@ static inline int template_callback_raw(unpack_user* u, const char* b, const cha
#endif
#ifdef HAVE_RUBY_ENCODING_H
static VALUE template_execute_rescue_enc(VALUE data)
{
    /*
     * Rescue handler used by template_execute_wrap when encoding support is
     * available. `data` is a VALUE[2] cast to VALUE: element 0 is the source
     * string, element 1 the encoding index it had before being switched.
     * Re-enables the GC, puts the original encoding index back on the string
     * and then runs RERAISE (defined elsewhere; presumably re-raises the
     * pending exception -- confirm).
     */
    VALUE* const saved = (VALUE*)data;
    VALUE source_str = saved[0];
    int original_enc = (int)saved[1];

    rb_gc_enable();
    rb_enc_set_index(source_str, original_enc);
    RERAISE;
}
#endif
static VALUE template_execute_rescue(VALUE nouse)
{
rb_gc_enable();
@ -203,31 +192,16 @@ static int template_execute_wrap(msgpack_unpack_t* mp,
(VALUE)from,
};
#ifdef HAVE_RUBY_ENCODING_H
int enc_orig = rb_enc_get_index(str);
rb_enc_set_index(str, s_ascii_8bit);
#endif
// FIXME mp->top is not updated while execute is running, so GC mark does not work
rb_gc_disable();
mp->user.source = str;
#ifdef HAVE_RUBY_ENCODING_H
VALUE resc[2] = {str, enc_orig};
int ret = (int)rb_rescue(template_execute_do, (VALUE)args,
template_execute_rescue_enc, (VALUE)resc);
#else
int ret = (int)rb_rescue(template_execute_do, (VALUE)args,
template_execute_rescue, Qnil);
#endif
rb_gc_enable();
#ifdef HAVE_RUBY_ENCODING_H
rb_enc_set_index(str, enc_orig);
#endif
return ret;
}
@ -746,10 +720,6 @@ void Init_msgpack_unpack(VALUE mMessagePack)
s_sysread = rb_intern("sysread");
s_readpartial = rb_intern("readpartial");
#ifdef HAVE_RUBY_ENCODING_H
s_ascii_8bit = rb_enc_find_index("ASCII-8BIT");
#endif
eUnpackError = rb_define_class_under(mMessagePack, "UnpackError", rb_eStandardError);
cUnpacker = rb_define_class_under(mMessagePack, "Unpacker", rb_cObject);
rb_define_alloc_func(cUnpacker, MessagePack_Unpacker_alloc);