From ace4f1e7f0fb832d1c52ee3f64d547328264b83f Mon Sep 17 00:00:00 2001 From: Hideyuki Tanaka Date: Tue, 29 Mar 2011 16:15:29 +0900 Subject: [PATCH] fix encoding unicode strings, and Text support --- haskell/msgpack.cabal | 4 +++ haskell/src/Data/MessagePack/Internal/Utf8.hs | 28 ++++++++++++++++ haskell/src/Data/MessagePack/Object.hs | 28 +++++++++++++--- haskell/src/Data/MessagePack/Pack.hs | 33 ++++++++++++------- haskell/src/Data/MessagePack/Unpack.hs | 27 +++++++++------ 5 files changed, 95 insertions(+), 25 deletions(-) create mode 100644 haskell/src/Data/MessagePack/Internal/Utf8.hs diff --git a/haskell/msgpack.cabal b/haskell/msgpack.cabal index 3afafd42..14b37671 100644 --- a/haskell/msgpack.cabal +++ b/haskell/msgpack.cabal @@ -23,6 +23,7 @@ Library Build-depends: base >=4 && <5, transformers >= 0.2 && < 0.3, bytestring >= 0.9 && < 0.10, + text >= 0.11 && < 0.12, vector >= 0.7 && < 0.8, attoparsec >= 0.8 && < 0.9, binary >= 0.5.0 && < 0.5.1, @@ -41,6 +42,9 @@ Library Data.MessagePack.Object Data.MessagePack.Derive + Other-modules: + Data.MessagePack.Internal.Utf8 + Source-repository head Type: git Location: git://github.com/msgpack/msgpack.git diff --git a/haskell/src/Data/MessagePack/Internal/Utf8.hs b/haskell/src/Data/MessagePack/Internal/Utf8.hs new file mode 100644 index 00000000..c109faaa --- /dev/null +++ b/haskell/src/Data/MessagePack/Internal/Utf8.hs @@ -0,0 +1,28 @@ +module Data.MessagePack.Internal.Utf8 ( + encodeUtf8, + decodeUtf8, + skipChar, + toLBS, + fromLBS, + ) where + +import qualified Data.ByteString as B +import qualified Data.ByteString.Lazy as BL +import qualified Data.Text as T +import qualified Data.Text.Encoding as T +import qualified Data.Text.Encoding.Error as T + +encodeUtf8 :: String -> B.ByteString +encodeUtf8 = T.encodeUtf8 . T.pack + +decodeUtf8 :: B.ByteString -> String +decodeUtf8 = T.unpack . T.decodeUtf8With skipChar + +skipChar :: T.OnDecodeError +skipChar _ _ = Nothing + +toLBS :: B.ByteString -> BL.ByteString +toLBS bs = BL.fromChunks [bs] + +fromLBS :: BL.ByteString -> B.ByteString +fromLBS = B.concat . BL.toChunks diff --git a/haskell/src/Data/MessagePack/Object.hs b/haskell/src/Data/MessagePack/Object.hs index 3eb36587..aaad669c 100644 --- a/haskell/src/Data/MessagePack/Object.hs +++ b/haskell/src/Data/MessagePack/Object.hs @@ -32,12 +32,17 @@ import Control.Monad import Control.Monad.Trans.Error () import qualified Data.Attoparsec as A import qualified Data.ByteString as B -import qualified Data.ByteString.Char8 as C8 +import qualified Data.ByteString.Lazy as BL +import qualified Data.Text as T +import qualified Data.Text.Encoding as T +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Lazy.Encoding as TL import Data.Typeable import Data.MessagePack.Assoc import Data.MessagePack.Pack import Data.MessagePack.Unpack +import Data.MessagePack.Internal.Utf8 -- | Object Representation of MessagePack data. data Object @@ -149,14 +154,29 @@ instance OBJECT Float where tryFromObject (ObjectFloat f) = Right f tryFromObject _ = tryFromObjectError +instance OBJECT String where + toObject = toObject . encodeUtf8 + tryFromObject obj = liftM decodeUtf8 $ tryFromObject obj + instance OBJECT B.ByteString where toObject = ObjectRAW tryFromObject (ObjectRAW bs) = Right bs tryFromObject _ = tryFromObjectError -instance OBJECT String where - toObject = toObject . C8.pack - tryFromObject obj = liftM C8.unpack $ tryFromObject obj +instance OBJECT BL.ByteString where + toObject = ObjectRAW . fromLBS + tryFromObject (ObjectRAW bs) = Right $ toLBS bs + tryFromObject _ = tryFromObjectError + +instance OBJECT T.Text where + toObject = ObjectRAW . T.encodeUtf8 + tryFromObject (ObjectRAW bs) = Right $ T.decodeUtf8With skipChar bs + tryFromObject _ = tryFromObjectError + +instance OBJECT TL.Text where + toObject = ObjectRAW . fromLBS . TL.encodeUtf8 + tryFromObject (ObjectRAW bs) = Right $ TL.decodeUtf8With skipChar $ toLBS bs + tryFromObject _ = tryFromObjectError instance OBJECT a => OBJECT [a] where toObject = ObjectArray . map toObject diff --git a/haskell/src/Data/MessagePack/Pack.hs b/haskell/src/Data/MessagePack/Pack.hs index e943765a..39394ff6 100644 --- a/haskell/src/Data/MessagePack/Pack.hs +++ b/haskell/src/Data/MessagePack/Pack.hs @@ -27,11 +27,15 @@ import Data.Binary.Put import Data.Binary.IEEE754 import Data.Bits import qualified Data.ByteString as B -import qualified Data.ByteString.Char8 as B8 -import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString.Lazy as BL +import qualified Data.Text as T +import qualified Data.Text.Encoding as T +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Lazy.Encoding as TL import qualified Data.Vector as V import Data.MessagePack.Assoc +import Data.MessagePack.Internal.Utf8 -- | Serializable class class Packable a where @@ -39,7 +43,7 @@ class Packable a where put :: a -> Put -- | Pack Haskell data to MessagePack string. -pack :: Packable a => a -> L.ByteString +pack :: Packable a => a -> BL.ByteString pack = runPut . put instance Packable Int where @@ -93,17 +97,24 @@ instance Packable Double where putFloat64be d instance Packable String where - put = putString length (putByteString . B8.pack) + put = putString encodeUtf8 B.length putByteString instance Packable B.ByteString where - put = putString B.length putByteString + put = putString id B.length putByteString -instance Packable L.ByteString where - put = putString (fromIntegral . L.length) putLazyByteString +instance Packable BL.ByteString where + put = putString id (fromIntegral . BL.length) putLazyByteString -putString :: (s -> Int) -> (s -> Put) -> s -> Put -putString lf pf str = do - case lf str of +instance Packable T.Text where + put = putString T.encodeUtf8 B.length putByteString + +instance Packable TL.Text where + put = putString TL.encodeUtf8 (fromIntegral . BL.length) putLazyByteString + +putString :: (s -> t) -> (t -> Int) -> (t -> Put) -> s -> Put +putString cnv lf pf str = do + let bs = cnv str + case lf bs of len | len <= 31 -> do putWord8 $ 0xA0 .|. fromIntegral len len | len < 0x10000 -> do @@ -112,7 +123,7 @@ putString lf pf str = do len -> do putWord8 0xDB putWord32be $ fromIntegral len - pf str + pf bs instance Packable a => Packable [a] where put = putArray length (mapM_ put) diff --git a/haskell/src/Data/MessagePack/Unpack.hs b/haskell/src/Data/MessagePack/Unpack.hs index 20deafad..66a61a22 100644 --- a/haskell/src/Data/MessagePack/Unpack.hs +++ b/haskell/src/Data/MessagePack/Unpack.hs @@ -36,8 +36,11 @@ import Data.Binary.Get import Data.Binary.IEEE754 import Data.Bits import qualified Data.ByteString as B -import qualified Data.ByteString.Char8 as B8 -import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString.Lazy as BL +import qualified Data.Text as T +import qualified Data.Text.Encoding as T +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Lazy.Encoding as TL import Data.Int import Data.Typeable import qualified Data.Vector as V @@ -45,6 +48,7 @@ import Data.Word import Text.Printf import Data.MessagePack.Assoc +import Data.MessagePack.Internal.Utf8 -- | Deserializable class class Unpackable a where @@ -57,8 +61,8 @@ class IsByteString s where instance IsByteString B.ByteString where toBS = id -instance IsByteString L.ByteString where - toBS = B.concat . L.toChunks +instance IsByteString BL.ByteString where + toBS = B.concat . BL.toChunks -- | The exception of unpack data UnpackError = @@ -153,13 +157,19 @@ instance Unpackable Double where fail $ printf "invlid double tag: 0x%02X" c instance Unpackable String where - get = parseString (\n -> return . B8.unpack =<< A.take n) + get = parseString (\n -> return . decodeUtf8 =<< A.take n) instance Unpackable B.ByteString where get = parseString A.take -instance Unpackable L.ByteString where - get = parseString (\n -> do bs <- A.take n; return $ L.fromChunks [bs]) +instance Unpackable BL.ByteString where + get = parseString (\n -> return . toLBS =<< A.take n) + +instance Unpackable T.Text where + get = parseString (\n -> return . T.decodeUtf8With skipChar =<< A.take n) + +instance Unpackable TL.Text where + get = parseString (\n -> return . TL.decodeUtf8With skipChar . toLBS =<< A.take n) parseString :: (Int -> A.Parser a) -> A.Parser a parseString aget = do @@ -311,6 +321,3 @@ parseInt32 = return . fromIntegral =<< parseUint32 parseInt64 :: A.Parser Int64 parseInt64 = return . fromIntegral =<< parseUint64 - -toLBS :: B.ByteString -> L.ByteString -toLBS bs = L.fromChunks [bs]