Implement MailMessage::decode #1543 (tentatively done); add encode 'B', decode 'Q' and 'B'

This commit is contained in:
Alex Fabijanic 2017-11-07 15:42:14 -06:00
parent 64becc6266
commit 7564c7bed2
3 changed files with 340 additions and 119 deletions

View File

@ -237,9 +237,9 @@ public:
void write(std::ostream& ostr) const;
/// Writes the mail message to the given output stream.
static std::string encodeWord(const std::string& text, const std::string& charset = "UTF-8");
static std::string encodeWord(const std::string& text, const std::string& charset = "UTF-8", char encoding = 'q');
/// If the given string contains non-ASCII characters,
/// encodes the given string using RFC 2047 "Q" word encoding.
/// encodes the given string using RFC 2047 'Q' or 'B' word encoding.
///
/// The given text must already be encoded in the character set
/// given in charset (default is UTF-8).
@ -247,11 +247,14 @@ public:
/// Returns the encoded string, or the original string if it
/// consists only of ASCII characters.
static MailMessage decodeWords(const MailMessage& msg);
/// Decodes a MailMessage
static std::string decodeWord(const std::string& encodedWord, std::string toCharset = "");
/// Decodes an encoded-word.
/// Decodes a string containing encoded-words according to the rules specified in
/// RFC 2047 and returns the decoded string. Both Q and B encodings are supported.
///
/// If toCharset is not provided, no decoded string conversion is performed (i.e.,
/// the string is simply decoded to the charset specified in the encodedWord string).
/// If toCharset is provided, the returned string is converted to the specified
/// charset. For a list of supported encodings, see Poco::TextEncodingRegistry.
static const std::string HEADER_SUBJECT;
static const std::string HEADER_FROM;
@ -289,7 +292,9 @@ protected:
static std::string decodeWord(const std::string& charset, char encoding,
const std::string& text, const std::string& toCharset);
static void getEncWordLimits(const std::string& encodedWord,
std::string::size_type& pos1, std::string::size_type& pos2);
std::string::size_type& pos1, std::string::size_type& pos2, bool isComment);
static void advanceToEncoded(const std::string& encoded, std::string& decoded,
std::string::size_type& pos1, bool& isComment);
private:
MailMessage(const MailMessage&);

View File

@ -662,142 +662,227 @@ void MailMessage::appendRecipient(const MailRecipient& recipient, std::string& s
}
std::string MailMessage::encodeWord(const std::string& text, const std::string& charset)
void encodeQ(std::string& encodedText, std::string::const_iterator it, std::string::size_type& lineLength)
{
bool containsNonASCII = false;
for (std::string::const_iterator it = text.begin(); it != text.end(); ++it)
switch (*it)
{
if (static_cast<unsigned char>(*it) > 127)
case ' ':
encodedText += '_';
lineLength++;
break;
case '=':
case '?':
case '_':
case '(':
case ')':
case '[':
case ']':
case '<':
case '>':
case ',':
case ';':
case ':':
case '.':
case '@':
encodedText += '=';
NumberFormatter::appendHex(encodedText, static_cast<unsigned>(static_cast<unsigned char>(*it)), 2);
lineLength += 3;
break;
default:
if (*it > 32 && *it < 127)
{
containsNonASCII = true;
break;
}
}
if (!containsNonASCII) return text;
std::string encodedText;
std::string::size_type lineLength = 0;
for (std::string::const_iterator it = text.begin(); it != text.end(); ++it)
{
if (lineLength == 0)
{
encodedText += "=?";
encodedText += charset;
encodedText += "?q?";
lineLength += charset.length() + 5;
}
switch (*it)
{
case ' ':
encodedText += '_';
encodedText += *it;
lineLength++;
break;
case '=':
case '?':
case '_':
case '(':
case ')':
case '[':
case ']':
case '<':
case '>':
case ',':
case ';':
case ':':
case '.':
case '@':
}
else
{
encodedText += '=';
NumberFormatter::appendHex(encodedText, static_cast<unsigned>(static_cast<unsigned char>(*it)), 2);
lineLength += 3;
break;
default:
if (*it > 32 && *it < 127)
{
encodedText += *it;
lineLength++;
}
else
{
encodedText += '=';
NumberFormatter::appendHex(encodedText, static_cast<unsigned>(static_cast<unsigned char>(*it)), 2);
lineLength += 3;
}
}
if ((lineLength >= 64 && (*it == ' ' || *it == '\t' || *it == '\r' || *it == '\n')) || lineLength >= 72)
{
encodedText += "?=\r\n ";
lineLength = 0;
}
}
if (lineLength > 0)
}
// Appends the opening part of an RFC 2047 encoded-word to encodedText,
// i.e. the sequence "=?" + charset + "?" + encoding + "?".
// The caller is responsible for appending the encoded payload and the
// closing "?=" afterwards.
void startEncoding(std::string& encodedText, const std::string& charset, char encoding)
{
	encodedText.append("=?")
		.append(charset)
		.append(1, '?')
		.append(1, encoding)
		.append(1, '?');
}
std::string MailMessage::encodeWord(const std::string& text, const std::string& charset, char encoding)
{
if (encoding == 'q' || encoding == 'Q')
{
encodedText += "?=";
}
bool containsNonASCII = false;
for (std::string::const_iterator it = text.begin(); it != text.end(); ++it)
{
if (static_cast<unsigned char>(*it) > 127)
{
containsNonASCII = true;
break;
}
}
if (!containsNonASCII) return text;
}
std::string encodedText;
std::string::size_type lineLength = 0;
if (encoding == 'q' || encoding == 'Q')
{
for (std::string::const_iterator it = text.begin(); it != text.end(); ++it)
{
if (lineLength == 0)
{
startEncoding(encodedText, charset, encoding);
lineLength += charset.length() + 5;
}
encodeQ(encodedText, it, lineLength);
if ((lineLength >= 64 &&
(*it == ' ' || *it == '\t' || *it == '\r' || *it == '\n')) ||
lineLength >= 72)
{
encodedText += "?=\r\n ";
lineLength = 0;
}
}
}
else if (encoding == 'b' || encoding == 'B')
{
// to ensure we're under 75 chars, 4 padding chars are always predicted
lineLength = 75 - (charset.length() + 5/*=??B?*/ + 2/*?=*/ + 4/*base64 padding*/);
std::string::size_type pos = 0;
size_t textLen = static_cast<size_t>(floor(lineLength * 3 / 4));
std::ostringstream ostr;
while (true)
{
Base64Encoder encoder(ostr);
encoder.rdbuf()->setLineLength(static_cast<int>(lineLength));
startEncoding(encodedText, charset, encoding);
std::string line = text.substr(pos, textLen);
encoder << line;
encoder.close();
encodedText.append(ostr.str());
encodedText.append("?=");
if (line.size() < textLen) break;
ostr.str("");
pos += textLen;
encodedText.append("\r\n");
}
lineLength = 0;;
}
else
{
throw InvalidArgumentException(Poco::format("MailMessage::encodeWord: "
"unknown encoding: %c", encoding));
}
if (lineLength > 0) encodedText += "?=";
return encodedText;
}
MailMessage MailMessage::decodeWords(const MailMessage& msg)
void MailMessage::advanceToEncoded(const std::string& encoded, std::string& decoded, std::string::size_type& pos1, bool& isComment)
{
MailMessage decodedMsg;
for (const auto& item : msg)
bool spaceOnly = isComment; // flag to trim away spaces between encoded-word's
auto it = encoded.begin();
auto end = encoded.end();
for (; it != end; ++it)
{
decodedMsg.set(item.first, decodeWord(item.second));
if (*it == '=')
{
if (++it != end && *it == '?')
{
if (spaceOnly) trimRightInPlace(decoded);
return;
}
}
else if (*it == '(') isComment = true;
else if (*it == ')') isComment = false;
if ((isComment) && (!Ascii::isSpace(*it))) spaceOnly = false;
decoded.append(1, *it);
++pos1;
}
return std::move(decodedMsg);
pos1 = std::string::npos;
}
std::string MailMessage::decodeWord(const std::string& encodedWord, std::string toCharset)
std::string MailMessage::decodeWord(const std::string& encoded, std::string toCharset)
{
std::string encodedWord = replace(encoded, "?=\r\n=?", "?==?");
bool toCharsetGiven = !toCharset.empty();
std::string errMsg;
const std::size_t notFound = std::string::npos;
std::string decoded;
std::string::size_type pos1, pos2;
getEncWordLimits(encodedWord, pos1, pos2);
while ((pos1 != notFound) && (pos2 != notFound) && pos2 > pos1 + 2)
std::string::size_type pos1 = 0, pos2 = 0;
bool isComment = false;
advanceToEncoded(encodedWord, decoded, pos1, isComment);
if (pos1 != notFound)
{
pos1 += 2;
StringTokenizer st(encodedWord.substr(pos1, pos2 - pos1), "?");
if (st.count() == 3)
getEncWordLimits(encodedWord, pos1, pos2, isComment);
while ((pos1 != notFound) && (pos2 != notFound) && pos2 > pos1 + 2)
{
std::string charset = st[0];
if (toCharset.empty()) toCharset = charset;
if (st[1].size() > 1)
pos1 += 2;
StringTokenizer st(encodedWord.substr(pos1, pos2 - pos1), "?");
if (st.count() == 3)
{
std::string charset = st[0];
if (!toCharsetGiven) toCharset = charset;
if (st[1].size() > 1)
{
throw InvalidArgumentException(Poco::format("MailMessage::decodeWord: "
"invalid encoding %s", st[1]));
}
char encoding = st[1][0];
std::string encodedText = st[2];
if (encodedText.find_first_of(" ?") != notFound)
{
throw InvalidArgumentException("MailMessage::decodeWord: "
"forbidden characters found in encoded-word");
}
else if (encoding == 'q' || encoding == 'Q')
{
// no incomplete encoded characters allowed on single line
std::string::size_type eqPos = encodedText.rfind('=');
if (eqPos != notFound)
{
if ((eqPos + 2) >= encodedText.size())
{
throw InvalidArgumentException("MailMessage::decodeWord: "
"incomplete encoded character found in encoded-word");
}
}
}
decoded.append(decodeWord(charset, encoding, encodedText, toCharset));
pos1 = pos2 + 2;
advanceToEncoded(encodedWord.substr(pos1), decoded, pos1, isComment);
if (pos1 != notFound) getEncWordLimits(encodedWord, pos1, pos2, isComment);
}
else
{
throw InvalidArgumentException(Poco::format("MailMessage::decodeWord: "
"invalid encoding %s", st[1]));
"invalid number of entries in encoded-word (expected 3, found %z)", st.count()));
}
char encoding = st[1][0];
std::string encodedText = st[2];
if (encodedText.find_first_of(" ?") != notFound)
{
throw InvalidArgumentException("MailMessage::decodeWord: "
"forbidden characters found in encoded-word");
}
decoded.append(decodeWord(charset, encoding, encodedText, toCharset));
pos1 = pos2 + 2;
pos1 = encodedWord.find("=?", pos1);
if (pos1 != notFound) pos2 = encodedWord.find("?=", pos1);
}
else
{
throw InvalidArgumentException(Poco::format("MailMessage::decodeWord: "
"invalid number of entries in encoded-word (expected 3, found %z)", st.count()));
}
}
else decoded = std::move(encodedWord);
return decoded;
}
void MailMessage::getEncWordLimits(const std::string& encodedWord, std::string::size_type& pos1, std::string::size_type& pos2)
void MailMessage::getEncWordLimits(const std::string& encodedWord, std::string::size_type& pos1, std::string::size_type& pos2, bool isComment)
{
const std::size_t notFound = std::string::npos;
pos1 = encodedWord.find("=?"); // beginning of encoded-word
pos1 = encodedWord.find("=?", pos1); // beginning of encoded-word
if (pos1 != notFound)
{
// must look for all '?' occurrences because of a case like this:
// must look sequentially for all '?' occurrences because of a (valid) case like this:
// =?ISO-8859-1?q?=C4?=
// where end would be prematurely found if we search for ?= only
pos2 = encodedWord.find('?', pos1 + 2); // first '?'
@ -805,15 +890,29 @@ void MailMessage::getEncWordLimits(const std::string& encodedWord, std::string::
pos2 = encodedWord.find('?', pos2 + 1); // second '?'
if (pos2 == notFound) goto err;
pos2 = encodedWord.find("?=", pos2 + 1); // end of encoded-word
if (pos2 == notFound) goto err;
// before we leave, double-check for the next encoded-word end, to make sure
// an illegal '?' was not sneaked in (eg. =?ISO-8859-1?q?=C4?=D6?=)
if (((encodedWord.find("?=", pos2 + 1) != notFound &&
encodedWord.find("=?", pos2 + 1) == notFound)) ||
((encodedWord.find("=?", pos2 + 1) != notFound &&
encodedWord.find("?=", pos2 + 1) == notFound))) goto err;
}
else goto err;
// if encoded word is in a comment, then '(' and ')' are forbidden inside it
if (isComment &&
(notFound != encodedWord.substr(pos1, pos2 - pos1).find_first_of("()"))) goto err;
return;
err:
throw InvalidArgumentException("MailMessage::encodedWordLimits: invalid encoded word");
}
std::string MailMessage::decodeWord(const std::string& charset,
char encoding, const std::string& text, const std::string& toCharset)
std::string MailMessage::decodeWord(const std::string& charset, char encoding,
const std::string& text, const std::string& toCharset)
{
const TextEncodingRegistry& registry = TextEncoding::registry();
if (!registry.has(charset) || !registry.has(toCharset))
@ -828,7 +927,13 @@ std::string MailMessage::decodeWord(const std::string& charset,
switch (encoding)
{
case 'B': case 'b':
{
std::istringstream istr(text);
Base64Decoder decoder(istr);
int c = decoder.get();
while (c != -1) { decoded.append(1, char(c)); c = decoder.get(); }
break;
}
case 'Q': case 'q':
{
bool isWide = false;
@ -836,6 +941,11 @@ std::string MailMessage::decodeWord(const std::string& charset,
std::vector<unsigned char> wideCharSeq;
for (const auto& c : text)
{
if (!Ascii::isPrintable(c) || c == '?' || c == ' ')
{
throw InvalidArgumentException("MailMessage::decodeWord: encoded-word must not contain "
"non-printable characters, '? or SPACE");
}
if (c == '_') decoded.append(1, ' ');
else if (c == '=') isWide = true;
else if (isWide)

View File

@ -665,6 +665,28 @@ void MailMessageTest::testEncodeWord()
" =?ISO-8859-1?q?is_also_a_very_long_text=2E_Longer_than_75_characters=2E_?=\r\n"
" =?ISO-8859-1?q?Long_enough_to_become_three_lines_after_being_word-encode?=\r\n"
" =?ISO-8859-1?q?d=2E?=");
plain = "If you can read this yo";
encoded = MailMessage::encodeWord(plain, "ISO-8859-1", 'B');
assert(encoded == "=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=");
plain = "u understand the example.";
encoded = MailMessage::encodeWord(plain, "ISO-8859-2", 'B');
assert(encoded == "=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=");
plain = "This text contains German Umlauts: \xC4\xD6. "
"It is also a very long text. Longer than 75 "
"characters. Long enough to become three lines "
"after being word-encoded.";
encoded = MailMessage::encodeWord(plain, "ISO-8859-1", 'B');
assert(encoded ==
"=?ISO-8859-1?B?VGhpcyB0ZXh0IGNvbnRhaW5zIEdlcm1hbiBVbWxhdXRzOiDE1i4gSQ==?=\r\n"
"=?ISO-8859-1?B?dCBpcyBhbHNvIGEgdmVyeSBsb25nIHRleHQuIExvbmdlciB0aGFuIA==?=\r\n"
"=?ISO-8859-1?B?NzUgY2hhcmFjdGVycy4gTG9uZyBlbm91Z2ggdG8gYmVjb21lIHRocg==?=\r\n"
"=?ISO-8859-1?B?ZWUgbGluZXMgYWZ0ZXIgYmVpbmcgd29yZC1lbmNvZGVkLg==?=");
std::string decoded = MailMessage::decodeWord(encoded);
assert (decoded == plain);
}
@ -674,8 +696,30 @@ void MailMessageTest::testDecodeWord()
std::string decoded = MailMessage::decodeWord(encoded);
assert(decoded == "\xC4\xD6");
encoded = "=?ISO-8859-1?q?=C4=D6?=abc=?ISO-8859-1?q?=C4=D6?=";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "\xC4\xD6" "abc" "\xC4\xD6");
decoded = MailMessage::decodeWord(encoded, "UTF-8");
assert(decoded == "\xC3\x84\xC3\x96");
assert(decoded == "\xC3\x84\xC3\x96" "abc" "\xC3\x84\xC3\x96");
try
{
MailMessage::decodeWord("=?ISO-8859-1?q?=C4 =D6?=");
fail("must fail");
}
catch (Poco::InvalidArgumentException&) {}
try
{
MailMessage::decodeWord("=?ISO-8859-1?q?=C4?=D6?=\r\n");
fail("must fail");
}
catch (Poco::InvalidArgumentException&) {}
encoded = "=?ISO-8859-1?q?=C4=D6_It_?=\r\n=?ISO-8859-1?q?is?=";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "\xC4\xD6 It is");
encoded = "=?ISO-8859-1?q?This_text_contains_German_Umlauts=3A_=C4=D6?=";
decoded = MailMessage::decodeWord(encoded);
@ -685,25 +729,87 @@ void MailMessageTest::testDecodeWord()
assert(decoded == "This text contains German Umlauts: \xC3\x84\xC3\x96");
encoded = "=?ISO-8859-1?q?This_text_contains_German_Umlauts=3A_=C4=D6=2E_It_?=\r\n"
" =?ISO-8859-1?q?is_also_a_very_long_text=2E_Longer_than_75_characters=2E_?=\r\n"
" =?ISO-8859-1?q?Long_enough_to_become_three_lines_after_being_word-encode?=\r\n"
" =?ISO-8859-1?q?d=2E?=";
"=?ISO-8859-1?q?is_also_a_very_long_text=2E_Longer_than_75_characters=2E_?=\r\n"
"=?ISO-8859-1?q?Long_enough_to_become_three_lines_after_being_word-encode?=\r\n"
"=?ISO-8859-1?q?d=2E?=";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "This text contains German Umlauts: \xC4\xD6. "
"It is also a very long text. Longer than 75 "
"characters. Long enough to become three lines "
"after being word-encoded.");
assert(decoded == "This text contains German Umlauts: \xC4\xD6. It "
"is also a very long text. Longer than 75 characters. "
"Long enough to become three lines after being word-encode"
"d.");
decoded = MailMessage::decodeWord(encoded, "UTF-8");
assert(decoded == "This text contains German Umlauts: \xC3\x84\xC3\x96. "
"It is also a very long text. Longer than 75 "
"characters. Long enough to become three lines "
"after being word-encoded.");
assert(decoded == "This text contains German Umlauts: \xC3\x84\xC3\x96. It "
"is also a very long text. Longer than 75 characters. "
"Long enough to become three lines after being word-encode"
"d.");
encoded = "=?ISO-8859-1?Q?=F8=E9?=";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "\xF8\xE9");
encoded = "From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>\r\n"
"To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>\r\n"
"CC: =?ISO-8859-1?Q?Andr=E9?=Pirard <PIRARD@vm1.ulg.ac.be>\r\n";
"CC: =?ISO-8859-1?Q?Andr=E9?= Pirard <PIRARD@vm1.ulg.ac.be>\r\n"
"Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\r\n"
"=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "From: Keith Moore <moore@cs.utk.edu>\r\n"
"To: Keld J\xF8rn Simonsen <keld@dkuug.dk>\r\n"
"CC: Andr\xE9 Pirard <PIRARD@vm1.ulg.ac.be>\r\n"
"Subject: If you can read this you understand the example.");
encoded = "From: =?ISO-8859-1?Q?Olle_J=E4rnefors?= <ojarnef@admin.kth.se>\r\n"
"To: ietf-822@dimacs.rutgers.edu, ojarnef@admin.kth.se\r\n"
"Subject: Time for ISO 10646?";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "From: Olle J\xE4rnefors <ojarnef@admin.kth.se>\r\n"
"To: ietf-822@dimacs.rutgers.edu, ojarnef@admin.kth.se\r\n"
"Subject: Time for ISO 10646?");
encoded = "To: Dave Crocker <dcrocker@mordor.stanford.edu>\r\n"
"Cc: ietf-822@dimacs.rutgers.edu, paf@comsol.se\r\n"
"From: =?ISO-8859-1?Q?Patrik_F=E4ltstr=F6m?= <paf@nada.kth.se>\r\n"
"Subject: Re: RFC-HDR care and feeding";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "To: Dave Crocker <dcrocker@mordor.stanford.edu>\r\n"
"Cc: ietf-822@dimacs.rutgers.edu, paf@comsol.se\r\n"
"From: Patrik F\xE4ltstr\xF6m <paf@nada.kth.se>\r\n"
"Subject: Re: RFC-HDR care and feeding");
// encoded chars cannot be broken between lines
try
{
encoded = "=?ISO-8859-1?Q?=?=\r\n=?ISO-8859-1?Q?AB?=";
decoded = MailMessage::decodeWord(encoded);
fail("must fail");
}
catch (Poco::InvalidArgumentException&) {}
// comments and spaces therein
encoded = "(=?ISO-8859-1?Q?a?=)";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "(a)");
encoded = "(=?ISO-8859-1?Q?a?= b =?ISO-8859-1?Q?c?=)";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "(a b c)");
encoded = "(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "(ab)");
encoded = "(=?ISO-8859-1?Q?a?=\r\n\t=?ISO-8859-1?Q?b?=)";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "(ab)");
encoded = "(=?ISO-8859-1?Q?a_b?=)";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "(a b)");
encoded = "(=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?=)";
decoded = MailMessage::decodeWord(encoded);
assert(decoded == "(a b)");
}