// Copyright Vladimir Prus 2002-2004.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt
// or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
#include <cstring>
|
|
#include <cassert>
|
|
#include <string>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <boost/progress.hpp>
|
|
#include <boost/bind.hpp>
|
|
#include <boost/ref.hpp>
|
|
|
|
#include <boost/program_options/detail/convert.hpp>
|
|
#include <boost/program_options/detail/utf8_codecvt_facet.hpp>
|
|
|
|
#include "minitest.hpp"
|
|
|
|
using namespace std;
|
|
|
|
// Returns the whole content of the file named 'filename' as one string.
// The file is opened with the default open mode of std::ifstream.
//
// Throws std::ios_base::failure when the file cannot be opened. The check
// is an explicit test rather than an assert so that it is still performed
// when NDEBUG is defined; otherwise a missing data file would silently be
// read as an empty string.
std::string file_content(const std::string& filename)
{
    std::ifstream ifs(filename.c_str());
    if (!ifs)
        throw std::ios_base::failure("cannot open file: " + filename);

    // Copy the complete stream buffer of the file into the string stream.
    std::stringstream ss;
    ss << ifs.rdbuf();

    return ss.str();
}
|
|
|
|
// A version of from_8_bit which does not use a function object, kept for
// performance comparison.
//
// Converts the narrow string 's' to a wide string by calling 'cvt.in'
// directly. Throws logic_error("character conversion failed") when the
// facet reports an error or when a conversion step yields no output.
std::wstring from_8_bit_2(const std::string& s,
                          const std::codecvt<wchar_t, char, std::mbstate_t>& cvt)
{
    // Number of wide characters converted per call to cvt.in.
    enum { chunk_size = 32 };

    std::wstring converted;
    std::mbstate_t shift_state = std::mbstate_t();

    const char* const input_end = s.data() + s.size();
    const char* cursor = s.data();

    // The interface of cvt is not really iterator-like, and the required
    // output size cannot be known without doing the conversion, so the
    // input is converted piece by piece.
    while (cursor != input_end) {
        // std::basic_string gives no non-const pointer to its data, so each
        // piece goes through a local buffer first.
        wchar_t chunk[chunk_size];
        wchar_t* chunk_end = chunk;

        // Convert as much of the remaining input as fits; 'cursor' is
        // advanced past the consumed source characters.
        const std::codecvt_base::result status =
            cvt.in(shift_state, cursor, input_end, cursor,
                   chunk, chunk + chunk_size, chunk_end);

        // 'partial' alone is not a failure: it only means some source
        // characters are still pending. A step that produces no target
        // character at all, however, means the source is incomplete, and
        // there is no further data to supply, so that is a failure too.
        const bool failed = (status == std::codecvt_base::error);
        const bool nothing_produced = (chunk_end == chunk);
        if (failed || nothing_produced)
            throw std::logic_error("character conversion failed");

        // Keep the characters produced by this step.
        converted.append(chunk, chunk_end);
    }

    return converted;
}
|
|
|
|
|
|
void test_convert(const std::string& input,
|
|
const std::string& expected_output)
|
|
{
|
|
boost::program_options::detail::utf8_codecvt_facet facet;
|
|
|
|
std::wstring output;
|
|
{
|
|
boost::progress_timer t;
|
|
for (int i = 0; i < 10000; ++i)
|
|
output = boost::from_8_bit(
|
|
input,
|
|
facet);
|
|
}
|
|
|
|
{
|
|
boost::progress_timer t;
|
|
for (int i = 0; i < 10000; ++i)
|
|
output = from_8_bit_2(
|
|
input,
|
|
facet);
|
|
}
|
|
|
|
BOOST_CHECK(output.size()*2 == expected_output.size());
|
|
|
|
for(unsigned i = 0; i < output.size(); ++i) {
|
|
|
|
{
|
|
unsigned low = output[i];
|
|
low &= 0xFF;
|
|
unsigned low2 = expected_output[2*i];
|
|
low2 &= 0xFF;
|
|
BOOST_CHECK(low == low2);
|
|
}
|
|
{
|
|
unsigned high = output[i];
|
|
high >>= 8;
|
|
high &= 0xFF;
|
|
unsigned high2 = expected_output[2*i+1];
|
|
BOOST_CHECK(high == high2);
|
|
}
|
|
}
|
|
|
|
string ref = boost::to_8_bit(output, facet);
|
|
|
|
BOOST_CHECK(ref == input);
|
|
}
|
|
|
|
int main(int ac, char* av[])
|
|
{
|
|
std::string input = file_content("utf8.txt");
|
|
std::string expected = file_content("ucs2.txt");
|
|
|
|
test_convert(input, expected);
|
|
|
|
if (ac > 1) {
|
|
cout << "Trying to convert the command line argument\n";
|
|
|
|
locale::global(locale(""));
|
|
std::wstring w = boost::from_local_8_bit(av[1]);
|
|
|
|
cout << "Got something, printing decimal code point values\n";
|
|
for (unsigned i = 0; i < w.size(); ++i) {
|
|
cout << (unsigned)w[i] << "\n";
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
}
|