From 19a730b78b8aa3a38a468f927567a4d94d4f7f2c Mon Sep 17 00:00:00 2001
From: ELynx
Date: Mon, 27 Jun 2016 11:52:22 +0300
Subject: [PATCH] Basic support for Unicode 'slash'uABCD escape notation - parser understands escape sequence and following data

---
Review notes (this area is not part of the commit message):
 * str_from_ll: the `template<> static` explicit specialization is replaced
   by overload dispatch, and the dependent value_type is named through
   `typename`, so the helper no longer depends on compiler extensions.
 * The std::string result is now UTF-8 encoded instead of raw big-endian
   bytes.
 * Line breaks, context indentation and the template parameter lists were
   reconstructed from a whitespace-collapsed copy of this patch; the author
   e-mail and the post-image blob id could not be recovered. Verify the
   context lines against the tree before applying.

 .../chaiscript/language/chaiscript_parser.hpp | 90 ++++++++++++++++++--
 1 file changed, 86 insertions(+), 4 deletions(-)

diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp
index 1ea7371..4104278 100644
--- a/include/chaiscript/language/chaiscript_parser.hpp
+++ b/include/chaiscript/language/chaiscript_parser.hpp
@@ -928,6 +928,50 @@ namespace chaiscript
         return false;
       }

+      // Generic for u16, u32 and (probably) wchar: the value becomes a single
+      // code unit of the requested string type. The pointer argument is only a
+      // tag for overload selection and is never dereferenced.
+      template<typename string_type>
+      static string_type str_from_ll_impl(long long val, const string_type *)
+      {
+        typedef typename string_type::value_type char_type;
+        return string_type(1, static_cast<char_type>(val)); //size, character
+      }
+
+      // Overload for char: the value is encoded as UTF-8, i.e. one to four
+      // bytes depending on its magnitude ('slash'uABCD needs at most three).
+      // NOTE(review): surrogate values 0xD800-0xDFFF are encoded as-is.
+      static std::string str_from_ll_impl(long long val, const std::string *)
+      {
+        std::string utf8;
+
+        if (val < 0x80) {
+          utf8.push_back(static_cast<char>(val));
+        } else if (val < 0x800) {
+          utf8.push_back(static_cast<char>(0xC0 | (val >> 6)));
+          utf8.push_back(static_cast<char>(0x80 | (val & 0x3F)));
+        } else if (val < 0x10000) {
+          utf8.push_back(static_cast<char>(0xE0 | (val >> 12)));
+          utf8.push_back(static_cast<char>(0x80 | ((val >> 6) & 0x3F)));
+          utf8.push_back(static_cast<char>(0x80 | (val & 0x3F)));
+        } else {
+          utf8.push_back(static_cast<char>(0xF0 | (val >> 18)));
+          utf8.push_back(static_cast<char>(0x80 | ((val >> 12) & 0x3F)));
+          utf8.push_back(static_cast<char>(0x80 | ((val >> 6) & 0x3F)));
+          utf8.push_back(static_cast<char>(0x80 | (val & 0x3F)));
+        }
+
+        return utf8;
+      }
+
+      // Builds a string of the requested type holding the character whose
+      // numeric value is val; string_type must be given explicitly.
+      template<typename string_type>
+      static string_type str_from_ll(long long val)
+      {
+        return str_from_ll_impl(val, static_cast<const string_type *>(nullptr));
+      }
+
       template<typename string_type>
       struct Char_Parser
       {
@@ -938,6 +982,7 @@ namespace chaiscript
         bool saw_interpolation_marker;
         bool is_octal;
         bool is_hex;
+        bool is_unicode;
         const bool interpolation_allowed;

         string_type octal_matches;
@@ -950,6 +995,7 @@ namespace chaiscript
           saw_interpolation_marker(false),
           is_octal(false),
           is_hex(false),
+          is_unicode(false),
           interpolation_allowed(t_interpolation_allowed)
         {
         }
@@ -964,6 +1010,10 @@ namespace chaiscript
           if (is_hex) {
             process_hex();
           }
+
+          if (is_unicode) {
+            process_unicode();
+          }
         }

         void process_hex()
@@ -985,9 +1035,28 @@ namespace chaiscript
           is_octal = false;
         }

+
+        // Converts the hex digits collected for a 'slash'u escape into a
+        // character, appends it to the match and leaves escape mode.
+        // NOTE(review): stoll throws std::invalid_argument when no hex digit
+        // followed 'slash'u (hex_matches is empty).
+        void process_unicode()
+        {
+          const auto val = stoll(hex_matches, 0, 16);
+          hex_matches.clear();
+          match += str_from_ll<string_type>(val);
+          is_escaped = false;
+          is_unicode = false;
+        }
+
         void parse(const char_type t_char, const int line, const int col, const std::string &filename) {
           const bool is_octal_char = t_char >= '0' && t_char <= '7';

+          // Used by both the 'slash'x and the 'slash'u branches below
+          const bool is_hex_char = (t_char >= '0' && t_char <= '9')
+                                || (t_char >= 'a' && t_char <= 'f')
+                                || (t_char >= 'A' && t_char <= 'F');
+
           if (is_octal) {
             if (is_octal_char) {
               octal_matches.push_back(t_char);
@@ -1000,10 +1069,6 @@ namespace chaiscript
               process_octal();
             }
           } else if (is_hex) {
-            const bool is_hex_char = (t_char >= '0' && t_char <= '9')
-                                  || (t_char >= 'a' && t_char <= 'f')
-                                  || (t_char >= 'A' && t_char <= 'F');
-
             if (is_hex_char) {
               hex_matches.push_back(t_char);

@@ -1018,6 +1083,21 @@ namespace chaiscript
             } else {
               process_hex();
             }
+          } else if (is_unicode) {
+            if (is_hex_char) {
+              hex_matches.push_back(t_char);
+
+              if(hex_matches.size() == 4) {
+                // Format is specified to be 'slash'uABCD
+                // on collecting from A to D do parsing
+                process_unicode();
+              }
+              return;
+            } else {
+              // Not a unicode escape anymore, try parsing anyway
+              // Maybe someone used 'slash'uAA only
+              process_unicode();
+            }
           }

           if (t_char == '\\') {
@@ -1034,6 +1114,8 @@ namespace chaiscript
               octal_matches.push_back(t_char);
             } else if (t_char == 'x') {
               is_hex = true;
+            } else if (t_char == 'u') {
+              is_unicode = true;
             } else {
               switch (t_char) {
                 case ('\'') : match.push_back('\''); break;