diff --git a/Lib/email/charset.py b/Lib/email/charset.py index c4b246455f86c64..5981791820e740c 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -9,6 +9,7 @@ 'add_codec', ] +import codecs from functools import partial import email.base64mime @@ -58,37 +59,71 @@ 'shift_jis': (BASE64, None, 'iso-2022-jp'), 'iso-2022-jp': (BASE64, None, None), 'koi8-r': (BASE64, BASE64, None), - 'utf-8': (SHORTEST, BASE64, 'utf-8'), } -# Aliases for other commonly-used names for character sets. Map -# them to the real ones used in email. +# Map Python codec names to their corresponding MIME/IANA names. ALIASES = { - 'latin_1': 'iso-8859-1', - 'latin-1': 'iso-8859-1', - 'latin_2': 'iso-8859-2', - 'latin-2': 'iso-8859-2', - 'latin_3': 'iso-8859-3', - 'latin-3': 'iso-8859-3', - 'latin_4': 'iso-8859-4', - 'latin-4': 'iso-8859-4', - 'latin_5': 'iso-8859-9', - 'latin-5': 'iso-8859-9', - 'latin_6': 'iso-8859-10', - 'latin-6': 'iso-8859-10', - 'latin_7': 'iso-8859-13', - 'latin-7': 'iso-8859-13', - 'latin_8': 'iso-8859-14', - 'latin-8': 'iso-8859-14', - 'latin_9': 'iso-8859-15', - 'latin-9': 'iso-8859-15', - 'latin_10':'iso-8859-16', - 'latin-10':'iso-8859-16', - 'cp949': 'ks_c_5601-1987', - 'euc_jp': 'euc-jp', - 'euc_kr': 'euc-kr', - 'ascii': 'us-ascii', - } + 'ascii': 'us-ascii', + 'big5hkscs': 'big5-hkscs', + 'cp037': 'ibm037', + 'cp1026': 'ibm1026', + 'cp1140': 'ibm01140', + 'cp1250': 'windows-1250', + 'cp1251': 'windows-1251', + 'cp1252': 'windows-1252', + 'cp1253': 'windows-1253', + 'cp1254': 'windows-1254', + 'cp1255': 'windows-1255', + 'cp1256': 'windows-1256', + 'cp1257': 'windows-1257', + 'cp1258': 'windows-1258', + 'cp273': 'ibm273', + 'cp424': 'ibm424', + 'cp437': 'ibm437', + 'cp500': 'ibm500', + 'cp775': 'ibm775', + 'cp850': 'ibm850', + 'cp852': 'ibm852', + 'cp855': 'ibm855', + 'cp857': 'ibm857', + 'cp858': 'ibm00858', + 'cp860': 'ibm860', + 'cp861': 'ibm861', + 'cp862': 'ibm862', + 'cp863': 'ibm863', + 'cp864': 'ibm864', + 'cp865': 'ibm865', + 'cp866': 'ibm866', + 
'cp869': 'ibm869', + 'cp874': 'windows-874', + 'euc_jp': 'euc-jp', + 'euc_kr': 'euc-kr', + 'hz': 'hz-gb-2312', + 'iso2022_jp': 'iso-2022-jp', + 'iso2022_jp_2': 'iso-2022-jp-2', + 'iso2022_kr': 'iso-2022-kr', + 'iso8859-1': 'iso-8859-1', + 'iso8859-10': 'iso-8859-10', + 'iso8859-11': 'iso-8859-11', + 'iso8859-13': 'iso-8859-13', + 'iso8859-14': 'iso-8859-14', + 'iso8859-15': 'iso-8859-15', + 'iso8859-16': 'iso-8859-16', + 'iso8859-2': 'iso-8859-2', + 'iso8859-3': 'iso-8859-3', + 'iso8859-4': 'iso-8859-4', + 'iso8859-5': 'iso-8859-5', + 'iso8859-6': 'iso-8859-6', + 'iso8859-7': 'iso-8859-7', + 'iso8859-8': 'iso-8859-8-i', + 'iso8859-9': 'iso-8859-9', + 'kz1048': 'kz-1048', + 'mac-roman': 'macintosh', + + # CP949 is not registered with IANA. KS_C_5601-1987 is not the same, + # but it is the closest registered option. + 'cp949': 'ks_c_5601-1987', +} # Map charsets to their Unicode codec strings. @@ -215,7 +250,18 @@ def __init__(self, input_charset=DEFAULT_CHARSET): raise errors.CharsetError(input_charset) input_charset = input_charset.lower() # Set the input charset after filtering through the aliases - self.input_charset = ALIASES.get(input_charset, input_charset) + # For backward compatibility, try ALIASES first to let the user + # override it. + if input_charset in ALIASES: + input_charset = ALIASES[input_charset] + else: + try: + input_codec = codecs.lookup(input_charset).name + except LookupError: + pass + else: + input_charset = ALIASES.get(input_codec, input_codec) + self.input_charset = input_charset # We can try to guess which encoding and conversion to use by the # charset_map dictionary. Try that first, but let the user override # it.
diff --git a/Lib/email/contentmanager.py b/Lib/email/contentmanager.py index 13fcb9787f1f320..faf2626bccce651 100644 --- a/Lib/email/contentmanager.py +++ b/Lib/email/contentmanager.py @@ -173,11 +173,11 @@ def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None, disposition=None, filename=None, cid=None, params=None, headers=None): _prepare_set(msg, 'text', subtype, headers) + + charset = email.charset.Charset(charset).input_charset cte, payload = _encode_text(string, charset, cte, msg.policy) msg.set_payload(payload) - msg.set_param('charset', - email.charset.ALIASES.get(charset, charset), - replace=True) + msg.set_param('charset', charset, replace=True) msg['Content-Transfer-Encoding'] = cte _finalize_set(msg, disposition, filename, cid, params) raw_data_manager.add_set_handler(str, set_text_content) diff --git a/Lib/test/test_email/test_asian_codecs.py b/Lib/test/test_email/test_asian_codecs.py index 85979ffd8169a75..59013f087199e3a 100644 --- a/Lib/test/test_email/test_asian_codecs.py +++ b/Lib/test/test_email/test_asian_codecs.py @@ -83,15 +83,15 @@ def test_chinese_codecs(self): h.append(s, Charset('big5hkscs')) eq(h.encode(), """\ Chinese =?gb2312?b?1tDOxA==?= =?gbk?b?1tDOxA==?= =?gb18030?b?1tDOxA==?= - =?hz?b?fntWUE5Efn0=?= =?big5?b?pKSk5Q==?= =?big5hkscs?b?pKSk5Q==?=""") + =?hz-gb-2312?b?fntWUE5Efn0=?= =?big5?b?pKSk5Q==?= =?big5-hkscs?b?pKSk5Q==?=""") eq(decode_header(h.encode()), [(b'Chinese ', None), (b'\xd6\xd0\xce\xc4', 'gb2312'), (b'\xd6\xd0\xce\xc4', 'gbk'), (b'\xd6\xd0\xce\xc4', 'gb18030'), - (b'~{VPND~}', 'hz'), + (b'~{VPND~}', 'hz-gb-2312'), (b'\xa4\xa4\xa4\xe5', 'big5'), - (b'\xa4\xa4\xa4\xe5', 'big5hkscs'), + (b'\xa4\xa4\xa4\xe5', 'big5-hkscs'), ]) def test_korean_codecs(self): diff --git a/Lib/test/test_email/test_contentmanager.py b/Lib/test/test_email/test_contentmanager.py index dceb54f15e48f4e..bc0e5d356181591 100644 --- a/Lib/test/test_email/test_contentmanager.py +++ b/Lib/test/test_email/test_contentmanager.py @@ 
-342,6 +342,19 @@ def test_set_text_charset_latin_1(self): self.assertEqual(m.get_payload(decode=True).decode('utf-8'), content) self.assertEqual(m.get_content(), content) + def test_set_text_charset_cp949(self): + m = self._make_message() + content = "\ud55c\uad6d\uc5b4\n\uac02\n" + raw_data_manager.set_content(m, content, charset='cp949') + self.assertEqual(str(m), textwrap.dedent("""\ + Content-Type: text/plain; charset="ks_c_5601-1987" + Content-Transfer-Encoding: base64 + + x9Gxub7uCoFBCg== + """)) + self.assertEqual(m.get_payload(decode=True).decode('ks_c_5601-1987'), content) + self.assertEqual(m.get_content(), content) + def test_set_text_plain_long_line_heuristics(self): m = self._make_message() content = ("Simple but long message that is over 78 characters" diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index d2c2261edbe04e1..19555d87085e176 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -4970,6 +4970,128 @@ def tearDown(self): except KeyError: pass + def test_attributes(self): + from email import charset + c = Charset() + self.assertEqual(c.input_charset, 'us-ascii') + self.assertEqual(c.header_encoding, None) + self.assertEqual(c.body_encoding, None) + self.assertEqual(c.output_charset, 'us-ascii') + self.assertEqual(c.input_codec, None) + self.assertEqual(c.output_codec, None) + + c = Charset('us-ascii') + self.assertEqual(c.input_charset, 'us-ascii') + self.assertEqual(c.header_encoding, None) + self.assertEqual(c.body_encoding, None) + self.assertEqual(c.output_charset, 'us-ascii') + self.assertEqual(c.input_codec, None) + self.assertEqual(c.output_codec, None) + + c = Charset('utf8') + self.assertEqual(c.input_charset, 'utf-8') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'utf-8') + self.assertEqual(c.input_codec, 'utf-8') + self.assertEqual(c.output_codec, 'utf-8') + + c = 
Charset('latin1') + self.assertEqual(c.input_charset, 'iso-8859-1') + self.assertEqual(c.header_encoding, charset.QP) + self.assertEqual(c.body_encoding, charset.QP) + self.assertEqual(c.output_charset, 'iso-8859-1') + self.assertEqual(c.input_codec, 'iso-8859-1') + self.assertEqual(c.output_codec, 'iso-8859-1') + + c = Charset('latin9') + self.assertEqual(c.input_charset, 'iso-8859-15') + self.assertEqual(c.header_encoding, charset.QP) + self.assertEqual(c.body_encoding, charset.QP) + self.assertEqual(c.output_charset, 'iso-8859-15') + self.assertEqual(c.input_codec, 'iso-8859-15') + self.assertEqual(c.output_codec, 'iso-8859-15') + + c = Charset('cyrillic') + self.assertEqual(c.input_charset, 'iso-8859-5') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'iso-8859-5') + self.assertEqual(c.input_codec, 'iso-8859-5') + self.assertEqual(c.output_codec, 'iso-8859-5') + + c = Charset('cp1251') + self.assertEqual(c.input_charset, 'windows-1251') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'windows-1251') + self.assertEqual(c.input_codec, 'windows-1251') + self.assertEqual(c.output_codec, 'windows-1251') + + c = Charset('cp1252') + self.assertEqual(c.input_charset, 'windows-1252') + self.assertEqual(c.header_encoding, charset.QP) + self.assertEqual(c.body_encoding, charset.QP) + self.assertEqual(c.output_charset, 'windows-1252') + self.assertEqual(c.input_codec, 'windows-1252') + self.assertEqual(c.output_codec, 'windows-1252') + + c = Charset('eucjp') + self.assertEqual(c.input_charset, 'euc-jp') + self.assertEqual(c.header_encoding, charset.BASE64) + self.assertEqual(c.body_encoding, None) + self.assertEqual(c.output_charset, 'iso-2022-jp') + self.assertEqual(c.input_codec, 'euc-jp') + self.assertEqual(c.output_codec, 'iso-2022-jp') + + c = Charset('cp949') + 
self.assertEqual(c.input_charset, 'ks_c_5601-1987') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'ks_c_5601-1987') + self.assertEqual(c.input_codec, 'ks_c_5601-1987') + self.assertEqual(c.output_codec, 'ks_c_5601-1987') + + c = Charset('gb2312') + self.assertEqual(c.input_charset, 'gb2312') + self.assertEqual(c.header_encoding, charset.BASE64) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'gb2312') + self.assertEqual(c.input_codec, 'gb2312') + self.assertEqual(c.output_codec, 'gb2312') + + c = Charset('big5') + self.assertEqual(c.input_charset, 'big5') + self.assertEqual(c.header_encoding, charset.BASE64) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'big5') + self.assertEqual(c.input_codec, 'big5') + self.assertEqual(c.output_codec, 'big5') + + def test_user_charsets(self): + from email import charset + c = Charset('fake0') + self.assertEqual(c.input_charset, 'fake0') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'fake0') + self.assertEqual(c.input_codec, 'fake0') + self.assertEqual(c.output_codec, 'fake0') + + charset.add_alias('fake1', 'mime-fake') + charset.add_alias('output-mime-fake', 'output-mime-fake-alias') + charset.add_codec('mime-fake', 'fakecodec') + charset.add_codec('output-mime-fake-alias', 'outputfakecodec') + charset.add_charset('mime-fake', charset.QP, None, 'output-mime-fake') + + c = Charset('fake1') + self.assertEqual(c.input_charset, 'mime-fake') + self.assertEqual(c.header_encoding, charset.QP) + self.assertEqual(c.body_encoding, None) + self.assertEqual(c.output_charset, 'output-mime-fake-alias') + self.assertEqual(c.input_codec, 'fakecodec') + self.assertEqual(c.output_codec, 'outputfakecodec') + def test_codec_encodeable(self): eq = self.assertEqual # Make 
sure us-ascii = no Unicode conversion @@ -5010,6 +5132,11 @@ def test_unicode_charset_name(self): self.assertEqual(str(charset), 'us-ascii') self.assertRaises(errors.CharsetError, Charset, 'asc\xffii') + def test_bytes_charset_name(self): + charset = Charset(b'us-ascii') + self.assertEqual(str(charset), 'us-ascii') + self.assertRaises(errors.CharsetError, Charset, b'asc\xffii') + # Test multilingual MIME headers. diff --git a/Misc/NEWS.d/next/Library/2026-05-17-12-37-59.gh-issue-53144.c5tr1p.rst b/Misc/NEWS.d/next/Library/2026-05-17-12-37-59.gh-issue-53144.c5tr1p.rst new file mode 100644 index 000000000000000..283a5ba44d1f19f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-17-12-37-59.gh-issue-53144.c5tr1p.rst @@ -0,0 +1,2 @@ +The :mod:`email` package now supports all aliases of Python codecs and uses +MIME/IANA names for all IANA-registered charsets.