Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 75 additions & 29 deletions Lib/email/charset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
'add_codec',
]

import codecs
from functools import partial

import email.base64mime
Expand Down Expand Up @@ -58,37 +59,71 @@
'shift_jis': (BASE64, None, 'iso-2022-jp'),
'iso-2022-jp': (BASE64, None, None),
'koi8-r': (BASE64, BASE64, None),
'utf-8': (SHORTEST, BASE64, 'utf-8'),
}

# Aliases for other commonly-used names for character sets. Map
# them to the real ones used in email.
# Map Python codec names to their corresponding MIME/IANA names.
ALIASES = {
'latin_1': 'iso-8859-1',
'latin-1': 'iso-8859-1',
'latin_2': 'iso-8859-2',
'latin-2': 'iso-8859-2',
'latin_3': 'iso-8859-3',
'latin-3': 'iso-8859-3',
'latin_4': 'iso-8859-4',
'latin-4': 'iso-8859-4',
'latin_5': 'iso-8859-9',
'latin-5': 'iso-8859-9',
'latin_6': 'iso-8859-10',
'latin-6': 'iso-8859-10',
'latin_7': 'iso-8859-13',
'latin-7': 'iso-8859-13',
'latin_8': 'iso-8859-14',
'latin-8': 'iso-8859-14',
'latin_9': 'iso-8859-15',
'latin-9': 'iso-8859-15',
'latin_10':'iso-8859-16',
'latin-10':'iso-8859-16',
'cp949': 'ks_c_5601-1987',
'euc_jp': 'euc-jp',
'euc_kr': 'euc-kr',
'ascii': 'us-ascii',
}
'ascii': 'us-ascii',
'big5hkscs': 'big5-hkscs',
'cp037': 'ibm037',
'cp1026': 'ibm1026',
'cp1140': 'ibm01140',
'cp1250': 'windows-1250',
'cp1251': 'windows-1251',
'cp1252': 'windows-1252',
'cp1253': 'windows-1253',
'cp1254': 'windows-1254',
'cp1255': 'windows-1255',
'cp1256': 'windows-1256',
'cp1257': 'windows-1257',
'cp1258': 'windows-1258',
'cp273': 'ibm273',
'cp424': 'ibm424',
'cp437': 'ibm437',
'cp500': 'ibm500',
'cp775': 'ibm775',
'cp850': 'ibm850',
'cp852': 'ibm852',
'cp855': 'ibm855',
'cp857': 'ibm857',
'cp858': 'ibm00858',
'cp860': 'ibm860',
'cp861': 'ibm861',
'cp862': 'ibm862',
'cp863': 'ibm863',
'cp864': 'ibm864',
'cp865': 'ibm865',
'cp866': 'ibm866',
'cp869': 'ibm869',
'cp874': 'windows-874',
'euc_jp': 'euc-jp',
'euc_kr': 'euc-kr',
'hz': 'hz-gb-2312',
'iso2022_jp': 'iso-2022-jp',
'iso2022_jp_2': 'iso-2022-jp-2',
'iso2022_kr': 'iso-2022-kr',
'iso8859-1': 'iso-8859-1',
'iso8859-10': 'iso-8859-10',
'iso8859-11': 'iso-8859-11',
'iso8859-13': 'iso-8859-13',
'iso8859-14': 'iso-8859-14',
'iso8859-15': 'iso-8859-15',
'iso8859-16': 'iso-8859-16',
'iso8859-2': 'iso-8859-2',
'iso8859-3': 'iso-8859-3',
'iso8859-4': 'iso-8859-4',
'iso8859-5': 'iso-8859-5',
'iso8859-6': 'iso-8859-6',
'iso8859-7': 'iso-8859-7',
'iso8859-8': 'iso-8859-8-i',
'iso8859-9': 'iso-8859-9',
'kz1048': 'kz-1048',
'mac-roman': 'macintosh',

# CP949 is not registered in IANA. KS_C_5601-1987 is not the same,
# but the closest registered option.
'cp949': 'ks_c_5601-1987',
}


# Map charsets to their Unicode codec strings.
Expand Down Expand Up @@ -215,7 +250,18 @@ def __init__(self, input_charset=DEFAULT_CHARSET):
raise errors.CharsetError(input_charset)
input_charset = input_charset.lower()
# Set the input charset after filtering through the aliases
self.input_charset = ALIASES.get(input_charset, input_charset)
# For backward compatibility, try ALIASES first to let the user
# override it.
if input_charset in ALIASES:
input_charset = ALIASES[input_charset]
else:
try:
input_codec = codecs.lookup(input_charset).name
except LookupError:
pass
else:
input_charset = ALIASES.get(input_codec, input_codec)
self.input_charset = input_charset
# We can try to guess which encoding and conversion to use by the
# charset_map dictionary. Try that first, but let the user override
# it.
Expand Down
6 changes: 3 additions & 3 deletions Lib/email/contentmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,11 +173,11 @@ def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None,
disposition=None, filename=None, cid=None,
params=None, headers=None):
_prepare_set(msg, 'text', subtype, headers)

charset = email.charset.Charset(charset).input_charset
cte, payload = _encode_text(string, charset, cte, msg.policy)
msg.set_payload(payload)
msg.set_param('charset',
email.charset.ALIASES.get(charset, charset),
replace=True)
msg.set_param('charset', charset, replace=True)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your reordering of the operations here looks correct, which presumably means there is a missing test that would show the bug. Do you want to add one? If not I'll make a note and try to remember to do it some day ;)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added test_set_text_charset_cp949. Note that charset="euc-kr", even if ALIASES maps 'cp949' to 'ks_c_5601-1987'.

But when I tried to add a similar test with shift_jis or euc-jp, it failed (trying to encode surrogates). It also fails with the current code.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #150771.

msg['Content-Transfer-Encoding'] = cte
_finalize_set(msg, disposition, filename, cid, params)
raw_data_manager.add_set_handler(str, set_text_content)
Expand Down
6 changes: 3 additions & 3 deletions Lib/test/test_email/test_asian_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,15 @@ def test_chinese_codecs(self):
h.append(s, Charset('big5hkscs'))
eq(h.encode(), """\
Chinese =?gb2312?b?1tDOxA==?= =?gbk?b?1tDOxA==?= =?gb18030?b?1tDOxA==?=
=?hz?b?fntWUE5Efn0=?= =?big5?b?pKSk5Q==?= =?big5hkscs?b?pKSk5Q==?=""")
=?hz-gb-2312?b?fntWUE5Efn0=?= =?big5?b?pKSk5Q==?= =?big5-hkscs?b?pKSk5Q==?=""")
eq(decode_header(h.encode()),
[(b'Chinese ', None),
(b'\xd6\xd0\xce\xc4', 'gb2312'),
(b'\xd6\xd0\xce\xc4', 'gbk'),
(b'\xd6\xd0\xce\xc4', 'gb18030'),
(b'~{VPND~}', 'hz'),
(b'~{VPND~}', 'hz-gb-2312'),
(b'\xa4\xa4\xa4\xe5', 'big5'),
(b'\xa4\xa4\xa4\xe5', 'big5hkscs'),
(b'\xa4\xa4\xa4\xe5', 'big5-hkscs'),
])

def test_korean_codecs(self):
Expand Down
13 changes: 13 additions & 0 deletions Lib/test/test_email/test_contentmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,19 @@ def test_set_text_charset_latin_1(self):
self.assertEqual(m.get_payload(decode=True).decode('utf-8'), content)
self.assertEqual(m.get_content(), content)

def test_set_text_charset_cp949(self):
    # Text set with charset='cp949' must be labelled with the MIME name
    # ks_c_5601-1987, carried as base64, and round-trip unchanged.
    msg = self._make_message()
    text = "\ud55c\uad6d\uc5b4\n\uac02\n"
    raw_data_manager.set_content(msg, text, charset='cp949')
    # The literal is kept flush-left so its bytes match the original.
    expected = textwrap.dedent("""\
Content-Type: text/plain; charset="ks_c_5601-1987"
Content-Transfer-Encoding: base64

x9Gxub7uCoFBCg==
""")
    self.assertEqual(str(msg), expected)
    decoded = msg.get_payload(decode=True).decode('ks_c_5601-1987')
    self.assertEqual(decoded, text)
    self.assertEqual(msg.get_content(), text)

def test_set_text_plain_long_line_heuristics(self):
m = self._make_message()
content = ("Simple but long message that is over 78 characters"
Expand Down
127 changes: 127 additions & 0 deletions Lib/test/test_email/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -4970,6 +4970,128 @@ def tearDown(self):
except KeyError:
pass

def test_attributes(self):
    # Verify all six public Charset attributes for the default charset,
    # for canonical MIME names and for Python codec aliases.
    from email import charset

    attr_names = (
        'input_charset',
        'header_encoding',
        'body_encoding',
        'output_charset',
        'input_codec',
        'output_codec',
    )
    SHORTEST, BASE64, QP = charset.SHORTEST, charset.BASE64, charset.QP
    plain_ascii = ('us-ascii', None, None, 'us-ascii', None, None)

    # Each entry is (Charset() positional arguments, expected attribute
    # values listed in the same order as attr_names).
    cases = [
        ((), plain_ascii),
        (('us-ascii',), plain_ascii),
        (('utf8',),
         ('utf-8', SHORTEST, BASE64, 'utf-8', 'utf-8', 'utf-8')),
        (('latin1',),
         ('iso-8859-1', QP, QP,
          'iso-8859-1', 'iso-8859-1', 'iso-8859-1')),
        (('latin9',),
         ('iso-8859-15', QP, QP,
          'iso-8859-15', 'iso-8859-15', 'iso-8859-15')),
        (('cyrillic',),
         ('iso-8859-5', SHORTEST, BASE64,
          'iso-8859-5', 'iso-8859-5', 'iso-8859-5')),
        (('cp1251',),
         ('windows-1251', SHORTEST, BASE64,
          'windows-1251', 'windows-1251', 'windows-1251')),
        (('cp1252',),
         ('windows-1252', QP, QP,
          'windows-1252', 'windows-1252', 'windows-1252')),
        (('eucjp',),
         ('euc-jp', BASE64, None,
          'iso-2022-jp', 'euc-jp', 'iso-2022-jp')),
        (('cp949',),
         ('ks_c_5601-1987', SHORTEST, BASE64,
          'ks_c_5601-1987', 'ks_c_5601-1987', 'ks_c_5601-1987')),
        (('gb2312',),
         ('gb2312', BASE64, BASE64, 'gb2312', 'gb2312', 'gb2312')),
        (('big5',),
         ('big5', BASE64, BASE64, 'big5', 'big5', 'big5')),
    ]

    for ctor_args, expected in cases:
        c = Charset(*ctor_args)
        for attr, value in zip(attr_names, expected):
            self.assertEqual(getattr(c, attr), value)

def test_user_charsets(self):
    # An unregistered name is passed through unchanged, while names
    # registered through add_alias/add_codec/add_charset are resolved.
    from email import charset

    def check(cs, **expected):
        # Compare attributes in the order the keywords were given.
        for attr, value in expected.items():
            self.assertEqual(getattr(cs, attr), value)

    check(
        Charset('fake0'),
        input_charset='fake0',
        header_encoding=charset.SHORTEST,
        body_encoding=charset.BASE64,
        output_charset='fake0',
        input_codec='fake0',
        output_codec='fake0',
    )

    # Register an input alias, an alias for the output charset, codecs
    # for both MIME names, and finally the charset itself.
    for alias, canonical in (
        ('fake1', 'mime-fake'),
        ('output-mime-fake', 'output-mime-fake-alias'),
    ):
        charset.add_alias(alias, canonical)
    for mime_name, codec_name in (
        ('mime-fake', 'fakecodec'),
        ('output-mime-fake-alias', 'outputfakecodec'),
    ):
        charset.add_codec(mime_name, codec_name)
    charset.add_charset('mime-fake', charset.QP, None, 'output-mime-fake')

    check(
        Charset('fake1'),
        input_charset='mime-fake',
        header_encoding=charset.QP,
        body_encoding=None,
        output_charset='output-mime-fake-alias',
        input_codec='fakecodec',
        output_codec='outputfakecodec',
    )
Comment thread
bitdancer marked this conversation as resolved.

def test_codec_encodeable(self):
eq = self.assertEqual
# Make sure us-ascii = no Unicode conversion
Expand Down Expand Up @@ -5010,6 +5132,11 @@ def test_unicode_charset_name(self):
self.assertEqual(str(charset), 'us-ascii')
self.assertRaises(errors.CharsetError, Charset, 'asc\xffii')

def test_bytes_charset_name(self):
    # A bytes charset name is accepted when it is pure ASCII and
    # rejected with CharsetError when it is not.
    cs = Charset(b'us-ascii')
    self.assertEqual(str(cs), 'us-ascii')
    with self.assertRaises(errors.CharsetError):
        Charset(b'asc\xffii')



# Test multilingual MIME headers.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
The :mod:`email` package now supports all aliases of Python codecs and uses
MIME/IANA names for all IANA-registered charsets.
Loading