499 lines
31 KiB
HTML
499 lines
31 KiB
HTML
|
||
<!DOCTYPE html>
|
||
<html lang="en">
|
||
<head>
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||
<meta name="color-scheme" content="light dark">
|
||
<title>PEP 540 – Add a new UTF-8 Mode | peps.python.org</title>
|
||
<link rel="shortcut icon" href="../_static/py.png">
|
||
<link rel="canonical" href="https://peps.python.org/pep-0540/">
|
||
<link rel="stylesheet" href="../_static/style.css" type="text/css">
|
||
<link rel="stylesheet" href="../_static/mq.css" type="text/css">
|
||
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" media="(prefers-color-scheme: light)" id="pyg-light">
|
||
<link rel="stylesheet" href="../_static/pygments_dark.css" type="text/css" media="(prefers-color-scheme: dark)" id="pyg-dark">
|
||
<link rel="alternate" type="application/rss+xml" title="Latest PEPs" href="https://peps.python.org/peps.rss">
|
||
<meta property="og:title" content='PEP 540 – Add a new UTF-8 Mode | peps.python.org'>
|
||
<meta property="og:description" content="Add a new “UTF-8 Mode” to enhance Python’s use of UTF-8. When UTF-8 Mode is active, Python will:">
|
||
<meta property="og:type" content="website">
|
||
<meta property="og:url" content="https://peps.python.org/pep-0540/">
|
||
<meta property="og:site_name" content="Python Enhancement Proposals (PEPs)">
|
||
<meta property="og:image" content="https://peps.python.org/_static/og-image.png">
|
||
<meta property="og:image:alt" content="Python PEPs">
|
||
<meta property="og:image:width" content="200">
|
||
<meta property="og:image:height" content="200">
|
||
<meta name="description" content="Add a new “UTF-8 Mode” to enhance Python’s use of UTF-8. When UTF-8 Mode is active, Python will:">
|
||
<meta name="theme-color" content="#3776ab">
|
||
</head>
|
||
<body>
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
|
||
<symbol id="svg-sun-half" viewBox="0 0 24 24" pointer-events="all">
|
||
<title>Following system colour scheme</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
|
||
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||
<circle cx="12" cy="12" r="9"></circle>
|
||
<path d="M12 3v18m0-12l4.65-4.65M12 14.3l7.37-7.37M12 19.6l8.85-8.85"></path>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-moon" viewBox="0 0 24 24" pointer-events="all">
|
||
<title>Selected dark colour scheme</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
|
||
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||
<path stroke="none" d="M0 0h24v24H0z" fill="none"></path>
|
||
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z"></path>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-sun" viewBox="0 0 24 24" pointer-events="all">
|
||
<title>Selected light colour scheme</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
|
||
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||
<circle cx="12" cy="12" r="5"></circle>
|
||
<line x1="12" y1="1" x2="12" y2="3"></line>
|
||
<line x1="12" y1="21" x2="12" y2="23"></line>
|
||
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
|
||
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
|
||
<line x1="1" y1="12" x2="3" y2="12"></line>
|
||
<line x1="21" y1="12" x2="23" y2="12"></line>
|
||
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
|
||
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
|
||
</svg>
|
||
</symbol>
|
||
</svg>
|
||
<script>
|
||
|
||
document.documentElement.dataset.colour_scheme = localStorage.getItem("colour_scheme") || "auto"
|
||
</script>
|
||
<section id="pep-page-section">
|
||
<header>
|
||
<h1>Python Enhancement Proposals</h1>
|
||
<ul class="breadcrumbs">
|
||
<li><a href="https://www.python.org/" title="The Python Programming Language">Python</a> » </li>
|
||
<li><a href="../pep-0000/">PEP Index</a> » </li>
|
||
<li>PEP 540</li>
|
||
</ul>
|
||
<button id="colour-scheme-cycler" onClick="setColourScheme(nextColourScheme())">
|
||
<svg aria-hidden="true" class="colour-scheme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
|
||
<svg aria-hidden="true" class="colour-scheme-icon-when-dark"><use href="#svg-moon"></use></svg>
|
||
<svg aria-hidden="true" class="colour-scheme-icon-when-light"><use href="#svg-sun"></use></svg>
|
||
<span class="visually-hidden">Toggle light / dark / auto colour theme</span>
|
||
</button>
|
||
</header>
|
||
<article>
|
||
<section id="pep-content">
|
||
<h1 class="page-title">PEP 540 – Add a new UTF-8 Mode</h1>
|
||
<dl class="rfc2822 field-list simple">
|
||
<dt class="field-odd">Author<span class="colon">:</span></dt>
|
||
<dd class="field-odd">Victor Stinner &lt;vstinner at python.org&gt;</dd>
|
||
<dt class="field-even">BDFL-Delegate<span class="colon">:</span></dt>
|
||
<dd class="field-even">INADA Naoki</dd>
|
||
<dt class="field-odd">Status<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><abbr title="Accepted and implementation complete, or no longer active">Final</abbr></dd>
|
||
<dt class="field-even">Type<span class="colon">:</span></dt>
|
||
<dd class="field-even"><abbr title="Normative PEP with a new feature for Python, implementation change for CPython or interoperability standard for the ecosystem">Standards Track</abbr></dd>
|
||
<dt class="field-odd">Created<span class="colon">:</span></dt>
|
||
<dd class="field-odd">05-Jan-2016</dd>
|
||
<dt class="field-even">Python-Version<span class="colon">:</span></dt>
|
||
<dd class="field-even">3.7</dd>
|
||
<dt class="field-odd">Resolution<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><a class="reference external" href="https://mail.python.org/pipermail/python-dev/2017-December/151173.html">Python-Dev message</a></dd>
|
||
</dl>
|
||
<hr class="docutils" />
|
||
<section id="contents">
|
||
<details><summary>Table of Contents</summary><ul class="simple">
|
||
<li><a class="reference internal" href="#abstract">Abstract</a></li>
|
||
<li><a class="reference internal" href="#rationale">Rationale</a><ul>
|
||
<li><a class="reference internal" href="#locale-encoding-and-utf-8">Locale encoding and UTF-8</a></li>
|
||
<li><a class="reference internal" href="#passthrough-for-undecodable-bytes-surrogateescape">Passthrough for undecodable bytes: surrogateescape</a></li>
|
||
<li><a class="reference internal" href="#no-change-by-default-for-best-backward-compatibility">No change by default for best backward compatibility</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#proposal">Proposal</a></li>
|
||
<li><a class="reference internal" href="#relationship-with-the-locale-coercion-pep-538">Relationship with the locale coercion (PEP 538)</a></li>
|
||
<li><a class="reference internal" href="#backward-compatibility">Backward Compatibility</a></li>
|
||
<li><a class="reference internal" href="#annex-encodings-and-error-handlers">Annex: Encodings And Error Handlers</a><ul>
|
||
<li><a class="reference internal" href="#encoding-and-error-handler">Encoding and error handler</a></li>
|
||
<li><a class="reference internal" href="#encoding-and-error-handler-on-windows">Encoding and error handler on Windows</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#links">Links</a></li>
|
||
<li><a class="reference internal" href="#post-history">Post History</a></li>
|
||
<li><a class="reference internal" href="#version-history">Version History</a></li>
|
||
<li><a class="reference internal" href="#copyright">Copyright</a></li>
|
||
</ul>
|
||
</details></section>
|
||
<section id="abstract">
|
||
<h2><a class="toc-backref" href="#abstract" role="doc-backlink">Abstract</a></h2>
|
||
<p>Add a new “UTF-8 Mode” to enhance Python’s use of UTF-8. When UTF-8 Mode
|
||
is active, Python will:</p>
|
||
<ul class="simple">
|
||
<li>use the <code class="docutils literal notranslate"><span class="pre">utf-8</span></code> encoding, regardless of the locale currently set by
|
||
the current platform, and</li>
|
||
<li>change the <code class="docutils literal notranslate"><span class="pre">stdin</span></code> and <code class="docutils literal notranslate"><span class="pre">stdout</span></code> error handlers to
|
||
<code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code>.</li>
|
||
</ul>
|
||
<p>This mode is off by default, but is automatically activated when using
|
||
the “POSIX” locale.</p>
|
||
<p>Add the <code class="docutils literal notranslate"><span class="pre">-X</span> <span class="pre">utf8</span></code> command line option and <code class="docutils literal notranslate"><span class="pre">PYTHONUTF8</span></code> environment
|
||
variable to control UTF-8 Mode.</p>
|
||
</section>
|
||
<section id="rationale">
|
||
<h2><a class="toc-backref" href="#rationale" role="doc-backlink">Rationale</a></h2>
|
||
<section id="locale-encoding-and-utf-8">
|
||
<h3><a class="toc-backref" href="#locale-encoding-and-utf-8" role="doc-backlink">Locale encoding and UTF-8</a></h3>
|
||
<p>Python 3.6 uses the locale encoding for filenames, environment
|
||
variables, standard streams, etc. The locale encoding is inherited from
|
||
the locale; the encoding and the locale are tightly coupled.</p>
|
||
<p>Many users inherit the ASCII encoding from the POSIX locale, aka the “C”
|
||
locale, but are unable to change the locale for various reasons. This
|
||
encoding is very limited in terms of Unicode support: any non-ASCII
|
||
character is likely to cause trouble.</p>
|
||
<p>It isn’t always easy to get an accurate locale. Locales don’t get the
|
||
exact same name on different Linux distributions, FreeBSD, macOS, etc.
|
||
And some locales, like the recent <code class="docutils literal notranslate"><span class="pre">C.UTF-8</span></code> locale, are only supported
|
||
by a few platforms. The current locale can even vary on the <em>same</em>
|
||
platform depending on context; for example, an SSH connection can use a
|
||
different encoding than the filesystem or local terminal encoding on the
|
||
same machine.</p>
|
||
<p>On the flip side, Python 3.6 is already using UTF-8 by default on macOS,
|
||
Android and Windows (<a class="pep reference internal" href="../pep-0529/" title="PEP 529 – Change Windows filesystem encoding to UTF-8">PEP 529</a>) for most functions – although
|
||
<code class="docutils literal notranslate"><span class="pre">open()</span></code> is a notable exception here. UTF-8 is also the default
|
||
encoding of Python scripts, XML and JSON file formats. The Go
|
||
programming language
|
||
uses UTF-8 for all strings.</p>
|
||
<p>UTF-8 support is nearly ubiquitous for data read and written by modern
|
||
platforms. It also has excellent support in Python. The problem is
|
||
simply that the locale is frequently misconfigured. An obvious solution
|
||
suggests itself: ignore the locale encoding and use UTF-8.</p>
|
||
</section>
|
||
<section id="passthrough-for-undecodable-bytes-surrogateescape">
|
||
<h3><a class="toc-backref" href="#passthrough-for-undecodable-bytes-surrogateescape" role="doc-backlink">Passthrough for undecodable bytes: surrogateescape</a></h3>
|
||
<p>When decoding bytes from UTF-8 using the default <code class="docutils literal notranslate"><span class="pre">strict</span></code> error
|
||
handler, Python 3 raises a <code class="docutils literal notranslate"><span class="pre">UnicodeDecodeError</span></code> on the first
|
||
undecodable byte.</p>
|
||
<p>Unix command line tools like <code class="docutils literal notranslate"><span class="pre">cat</span></code> or <code class="docutils literal notranslate"><span class="pre">grep</span></code> and most Python 2
|
||
applications simply do not have this class of bugs: they don’t decode
|
||
data, but process data as a raw bytes sequence.</p>
|
||
<p>Python 3 already has a solution to behave like Unix tools and Python 2:
|
||
the <code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code> error handler (<a class="pep reference internal" href="../pep-0383/" title="PEP 383 – Non-decodable Bytes in System Character Interfaces">PEP 383</a>). It allows processing
|
||
data as if it were bytes, but uses Unicode in practice; undecodable
|
||
bytes are stored as surrogate characters.</p>
|
||
<p>UTF-8 Mode sets the <code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code> error handler for <code class="docutils literal notranslate"><span class="pre">stdin</span></code>
|
||
and <code class="docutils literal notranslate"><span class="pre">stdout</span></code>, since these streams are commonly associated with Unix
|
||
command line tools.</p>
|
||
<p>However, users have a different expectation on files. Files are expected
|
||
to be properly encoded, and Python is expected to fail early when
|
||
<code class="docutils literal notranslate"><span class="pre">open()</span></code> is called with the wrong options, like opening a JPEG picture
|
||
in text mode. The <code class="docutils literal notranslate"><span class="pre">open()</span></code> default error handler remains <code class="docutils literal notranslate"><span class="pre">strict</span></code>
|
||
for these reasons.</p>
|
||
</section>
|
||
<section id="no-change-by-default-for-best-backward-compatibility">
|
||
<h3><a class="toc-backref" href="#no-change-by-default-for-best-backward-compatibility" role="doc-backlink">No change by default for best backward compatibility</a></h3>
|
||
<p>While UTF-8 is perfect in most cases, sometimes the locale encoding is
|
||
actually the best encoding.</p>
|
||
<p>This PEP changes the behaviour for the POSIX locale since this locale is
|
||
usually equivalent to the ASCII encoding, whereas UTF-8 is a much better
|
||
choice. It does not change the behaviour for other locales to prevent
|
||
any risk of regression.</p>
|
||
<p>As users are responsible for explicitly enabling the new UTF-8 Mode for
|
||
these other locales, they are responsible for any potential mojibake
|
||
issues caused by UTF-8 Mode.</p>
|
||
</section>
|
||
</section>
|
||
<section id="proposal">
|
||
<h2><a class="toc-backref" href="#proposal" role="doc-backlink">Proposal</a></h2>
|
||
<p>Add a new UTF-8 Mode to use the UTF-8 encoding, ignore the locale
|
||
encoding, and change <code class="docutils literal notranslate"><span class="pre">stdin</span></code> and <code class="docutils literal notranslate"><span class="pre">stdout</span></code> error handlers to
|
||
<code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code>.</p>
|
||
<p>Add the new <code class="docutils literal notranslate"><span class="pre">-X</span> <span class="pre">utf8</span></code> command line option and <code class="docutils literal notranslate"><span class="pre">PYTHONUTF8</span></code>
|
||
environment variable. Users can explicitly activate UTF-8 Mode with the
|
||
command-line option <code class="docutils literal notranslate"><span class="pre">-X</span> <span class="pre">utf8</span></code> or by setting the environment variable
|
||
<code class="docutils literal notranslate"><span class="pre">PYTHONUTF8=1</span></code>.</p>
|
||
<p>This mode is disabled by default and enabled by the POSIX locale. Users
|
||
can explicitly disable UTF-8 Mode with the command-line option <code class="docutils literal notranslate"><span class="pre">-X</span>
|
||
<span class="pre">utf8=0</span></code> or by setting the environment variable <code class="docutils literal notranslate"><span class="pre">PYTHONUTF8=0</span></code>.</p>
|
||
<p>For standard streams, the <code class="docutils literal notranslate"><span class="pre">PYTHONIOENCODING</span></code> environment variable has
|
||
priority over UTF-8 Mode.</p>
|
||
<p>On Windows, the <code class="docutils literal notranslate"><span class="pre">PYTHONLEGACYWINDOWSFSENCODING</span></code> environment variable
|
||
(<a class="pep reference internal" href="../pep-0529/" title="PEP 529 – Change Windows filesystem encoding to UTF-8">PEP 529</a>) has priority over UTF-8 Mode.</p>
|
||
<p>Effects of UTF-8 Mode:</p>
|
||
<ul class="simple">
|
||
<li><code class="docutils literal notranslate"><span class="pre">sys.getfilesystemencoding()</span></code> returns <code class="docutils literal notranslate"><span class="pre">'UTF-8'</span></code>.</li>
|
||
<li><code class="docutils literal notranslate"><span class="pre">locale.getpreferredencoding()</span></code> returns <code class="docutils literal notranslate"><span class="pre">UTF-8</span></code>; its
|
||
<em>do_setlocale</em> argument, and the locale encoding, are ignored.</li>
|
||
<li><code class="docutils literal notranslate"><span class="pre">sys.stdin</span></code> and <code class="docutils literal notranslate"><span class="pre">sys.stdout</span></code> error handlers are set to
|
||
<code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code>.</li>
|
||
</ul>
|
||
<p>Side effects:</p>
|
||
<ul class="simple">
|
||
<li><code class="docutils literal notranslate"><span class="pre">open()</span></code> uses the UTF-8 encoding by default. However, it still
|
||
uses the <code class="docutils literal notranslate"><span class="pre">strict</span></code> error handler by default.</li>
|
||
<li><code class="docutils literal notranslate"><span class="pre">os.fsdecode()</span></code> and <code class="docutils literal notranslate"><span class="pre">os.fsencode()</span></code> use the UTF-8 encoding.</li>
|
||
<li>Command line arguments, environment variables and filenames use the
|
||
UTF-8 encoding.</li>
|
||
</ul>
|
||
</section>
|
||
<section id="relationship-with-the-locale-coercion-pep-538">
|
||
<h2><a class="toc-backref" href="#relationship-with-the-locale-coercion-pep-538" role="doc-backlink">Relationship with the locale coercion (PEP 538)</a></h2>
|
||
<p>The POSIX locale enables the locale coercion (<a class="pep reference internal" href="../pep-0538/" title="PEP 538 – Coercing the legacy C locale to a UTF-8 based locale">PEP 538</a>) and the UTF-8
|
||
mode (<a class="pep reference internal" href="../pep-0540/" title="PEP 540 – Add a new UTF-8 Mode">PEP 540</a>). When the locale coercion is enabled, enabling the
|
||
UTF-8 mode has no additional effect.</p>
|
||
<p>The UTF-8 Mode has the same effect as locale coercion:</p>
|
||
<ul class="simple">
|
||
<li><code class="docutils literal notranslate"><span class="pre">sys.getfilesystemencoding()</span></code> returns <code class="docutils literal notranslate"><span class="pre">'UTF-8'</span></code>,</li>
|
||
<li><code class="docutils literal notranslate"><span class="pre">locale.getpreferredencoding()</span></code> returns <code class="docutils literal notranslate"><span class="pre">UTF-8</span></code>, and</li>
|
||
<li>the <code class="docutils literal notranslate"><span class="pre">sys.stdin</span></code> and <code class="docutils literal notranslate"><span class="pre">sys.stdout</span></code> error handlers are set to
|
||
<code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code>.</li>
|
||
</ul>
|
||
<p>These changes only affect Python code. But the locale coercion has
|
||
additional effects: the <code class="docutils literal notranslate"><span class="pre">LC_CTYPE</span></code> environment variable and the
|
||
<code class="docutils literal notranslate"><span class="pre">LC_CTYPE</span></code> locale are set to a UTF-8 locale like <code class="docutils literal notranslate"><span class="pre">C.UTF-8</span></code>. One side
|
||
effect is that non-Python code is also impacted by the locale coercion.
|
||
The two PEPs are complementary.</p>
|
||
<p>On platforms like CentOS 7 where locale coercion is not supported, the
|
||
POSIX locale only enables UTF-8 Mode. In this case, Python code uses
|
||
the UTF-8 encoding and ignores the locale encoding, whereas non-Python
|
||
code uses the locale encoding, which is usually ASCII for the POSIX
|
||
locale.</p>
|
||
<p>While the UTF-8 Mode is supported on all platforms and can be enabled
|
||
with any locale, the locale coercion is not supported by all platforms
|
||
and is restricted to the POSIX locale.</p>
|
||
<p>The UTF-8 Mode only has an impact on Python child processes when the
|
||
<code class="docutils literal notranslate"><span class="pre">PYTHONUTF8</span></code> environment variable is set to <code class="docutils literal notranslate"><span class="pre">1</span></code>, whereas the locale
|
||
coercion sets the <code class="docutils literal notranslate"><span class="pre">LC_CTYPE</span></code> environment variable, which impacts all
|
||
child processes.</p>
|
||
<p>The benefit of the locale coercion approach is that it helps ensure that
|
||
encoding handling in binary extension modules and child processes is
|
||
consistent with Python’s encoding handling. The upside of the UTF-8 Mode
|
||
approach is that it allows an embedding application to change the
|
||
interpreter’s behaviour without having to change the process global
|
||
locale settings.</p>
|
||
</section>
|
||
<section id="backward-compatibility">
|
||
<h2><a class="toc-backref" href="#backward-compatibility" role="doc-backlink">Backward Compatibility</a></h2>
|
||
<p>The only backward incompatible change is that the POSIX locale now
|
||
enables the UTF-8 Mode by default: it will now use the UTF-8 encoding,
|
||
ignore the locale encoding, and change <code class="docutils literal notranslate"><span class="pre">stdin</span></code> and <code class="docutils literal notranslate"><span class="pre">stdout</span></code> error
|
||
handlers to <code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code>.</p>
|
||
</section>
|
||
<section id="annex-encodings-and-error-handlers">
|
||
<h2><a class="toc-backref" href="#annex-encodings-and-error-handlers" role="doc-backlink">Annex: Encodings And Error Handlers</a></h2>
|
||
<p>UTF-8 Mode changes the default encoding and error handler used by
|
||
<code class="docutils literal notranslate"><span class="pre">open()</span></code>, <code class="docutils literal notranslate"><span class="pre">os.fsdecode()</span></code>, <code class="docutils literal notranslate"><span class="pre">os.fsencode()</span></code>, <code class="docutils literal notranslate"><span class="pre">sys.stdin</span></code>,
|
||
<code class="docutils literal notranslate"><span class="pre">sys.stdout</span></code> and <code class="docutils literal notranslate"><span class="pre">sys.stderr</span></code>.</p>
|
||
<section id="encoding-and-error-handler">
|
||
<h3><a class="toc-backref" href="#encoding-and-error-handler" role="doc-backlink">Encoding and error handler</a></h3>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head">Function</th>
|
||
<th class="head">Default</th>
|
||
<th class="head">UTF-8 Mode or POSIX locale</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td>open()</td>
|
||
<td>locale/strict</td>
|
||
<td><strong>UTF-8</strong>/strict</td>
|
||
</tr>
|
||
<tr class="row-odd"><td>os.fsdecode(), os.fsencode()</td>
|
||
<td>locale/surrogateescape</td>
|
||
<td><strong>UTF-8</strong>/surrogateescape</td>
|
||
</tr>
|
||
<tr class="row-even"><td>sys.stdin, sys.stdout</td>
|
||
<td>locale/strict</td>
|
||
<td><strong>UTF-8/surrogateescape</strong></td>
|
||
</tr>
|
||
<tr class="row-odd"><td>sys.stderr</td>
|
||
<td>locale/backslashreplace</td>
|
||
<td><strong>UTF-8</strong>/backslashreplace</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p>By comparison, Python 3.6 uses:</p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head">Function</th>
|
||
<th class="head">Default</th>
|
||
<th class="head">POSIX locale</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td>open()</td>
|
||
<td>locale/strict</td>
|
||
<td>locale/strict</td>
|
||
</tr>
|
||
<tr class="row-odd"><td>os.fsdecode(), os.fsencode()</td>
|
||
<td>locale/surrogateescape</td>
|
||
<td>locale/surrogateescape</td>
|
||
</tr>
|
||
<tr class="row-even"><td>sys.stdin, sys.stdout</td>
|
||
<td>locale/strict</td>
|
||
<td>locale/<strong>surrogateescape</strong></td>
|
||
</tr>
|
||
<tr class="row-odd"><td>sys.stderr</td>
|
||
<td>locale/backslashreplace</td>
|
||
<td>locale/backslashreplace</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="encoding-and-error-handler-on-windows">
|
||
<h3><a class="toc-backref" href="#encoding-and-error-handler-on-windows" role="doc-backlink">Encoding and error handler on Windows</a></h3>
|
||
<p>On Windows, the encodings and error handlers are different:</p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head">Function</th>
|
||
<th class="head">Default</th>
|
||
<th class="head">Legacy Windows FS encoding</th>
|
||
<th class="head">UTF-8 Mode</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td>open()</td>
|
||
<td>mbcs/strict</td>
|
||
<td>mbcs/strict</td>
|
||
<td><strong>UTF-8</strong>/strict</td>
|
||
</tr>
|
||
<tr class="row-odd"><td>os.fsdecode(), os.fsencode()</td>
|
||
<td>UTF-8/surrogatepass</td>
|
||
<td><strong>mbcs/replace</strong></td>
|
||
<td>UTF-8/surrogatepass</td>
|
||
</tr>
|
||
<tr class="row-even"><td>sys.stdin, sys.stdout</td>
|
||
<td>UTF-8/surrogateescape</td>
|
||
<td>UTF-8/surrogateescape</td>
|
||
<td>UTF-8/surrogateescape</td>
|
||
</tr>
|
||
<tr class="row-odd"><td>sys.stderr</td>
|
||
<td>UTF-8/backslashreplace</td>
|
||
<td>UTF-8/backslashreplace</td>
|
||
<td>UTF-8/backslashreplace</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p>By comparison, Python 3.6 uses:</p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head">Function</th>
|
||
<th class="head">Default</th>
|
||
<th class="head">Legacy Windows FS encoding</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td>open()</td>
|
||
<td>mbcs/strict</td>
|
||
<td>mbcs/strict</td>
|
||
</tr>
|
||
<tr class="row-odd"><td>os.fsdecode(), os.fsencode()</td>
|
||
<td>UTF-8/surrogatepass</td>
|
||
<td><strong>mbcs/replace</strong></td>
|
||
</tr>
|
||
<tr class="row-even"><td>sys.stdin, sys.stdout</td>
|
||
<td>UTF-8/surrogateescape</td>
|
||
<td>UTF-8/surrogateescape</td>
|
||
</tr>
|
||
<tr class="row-odd"><td>sys.stderr</td>
|
||
<td>UTF-8/backslashreplace</td>
|
||
<td>UTF-8/backslashreplace</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p>The “Legacy Windows FS encoding” is enabled by the
|
||
<code class="docutils literal notranslate"><span class="pre">PYTHONLEGACYWINDOWSFSENCODING</span></code> environment variable.</p>
|
||
<p>If stdin and/or stdout is redirected to a pipe, <code class="docutils literal notranslate"><span class="pre">sys.stdin</span></code> and/or
|
||
<code class="docutils literal notranslate"><span class="pre">sys.stdout</span></code> uses <code class="docutils literal notranslate"><span class="pre">mbcs</span></code> encoding by default rather than UTF-8.
|
||
But in UTF-8 Mode, <code class="docutils literal notranslate"><span class="pre">sys.stdin</span></code> and <code class="docutils literal notranslate"><span class="pre">sys.stdout</span></code> always use the UTF-8
|
||
encoding.</p>
|
||
<div class="admonition note">
|
||
<p class="admonition-title">Note</p>
|
||
<p>There is no POSIX locale on Windows. The ANSI code page is used as
|
||
the locale encoding, and this code page never uses the ASCII
|
||
encoding.</p>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="links">
|
||
<h2><a class="toc-backref" href="#links" role="doc-backlink">Links</a></h2>
|
||
<ul class="simple">
|
||
<li><a class="reference external" href="https://bugs.python.org/issue29240">bpo-29240: Implementation of the PEP 540: Add a new UTF-8 Mode</a></li>
|
||
<li><a class="pep reference internal" href="../pep-0538/" title="PEP 538 – Coercing the legacy C locale to a UTF-8 based locale">PEP 538</a>:
|
||
“Coercing the legacy C locale to C.UTF-8”</li>
|
||
<li><a class="pep reference internal" href="../pep-0529/" title="PEP 529 – Change Windows filesystem encoding to UTF-8">PEP 529</a>:
|
||
“Change Windows filesystem encoding to UTF-8”</li>
|
||
<li><a class="pep reference internal" href="../pep-0528/" title="PEP 528 – Change Windows console encoding to UTF-8">PEP 528</a>:
|
||
“Change Windows console encoding to UTF-8”</li>
|
||
<li><a class="pep reference internal" href="../pep-0383/" title="PEP 383 – Non-decodable Bytes in System Character Interfaces">PEP 383</a>:
|
||
“Non-decodable Bytes in System Character Interfaces”</li>
|
||
</ul>
|
||
</section>
|
||
<section id="post-history">
|
||
<h2><a class="toc-backref" href="#post-history" role="doc-backlink">Post History</a></h2>
|
||
<ul class="simple">
|
||
<li>2017-12: <a class="reference external" href="https://mail.python.org/pipermail/python-dev/2017-December/151054.html">[Python-Dev] PEP 540: Add a new UTF-8 Mode</a></li>
|
||
<li>2017-04: <a class="reference external" href="https://mail.python.org/pipermail/python-dev/2017-April/147795.html">[Python-Dev] Proposed BDFL Delegate update for PEPs 538 &amp;
|
||
540 (assuming UTF-8 for *nix system boundaries)</a></li>
|
||
<li>2017-01: <a class="reference external" href="https://mail.python.org/pipermail/python-ideas/2017-January/044089.html">[Python-ideas] PEP 540: Add a new UTF-8 Mode</a></li>
|
||
<li>2017-01: <a class="reference external" href="https://bugs.python.org/issue28180#msg284764">bpo-28180: Implementation of the PEP 538: coerce C locale to
|
||
C.utf-8 (msg284764)</a></li>
|
||
<li>2016-08-17: <a class="reference external" href="https://bugs.python.org/issue27781#msg272916">bpo-27781: Change sys.getfilesystemencoding() on Windows
|
||
to UTF-8 (msg272916)</a>
|
||
– Victor proposed <code class="docutils literal notranslate"><span class="pre">-X</span> <span class="pre">utf8</span></code> for the <a class="pep reference internal" href="../pep-0529/" title="PEP 529 – Change Windows filesystem encoding to UTF-8">PEP 529</a> (Change Windows
|
||
filesystem encoding to UTF-8)</li>
|
||
</ul>
|
||
</section>
|
||
<section id="version-history">
|
||
<h2><a class="toc-backref" href="#version-history" role="doc-backlink">Version History</a></h2>
|
||
<ul class="simple">
|
||
<li>Version 4: <code class="docutils literal notranslate"><span class="pre">locale.getpreferredencoding()</span></code> now returns <code class="docutils literal notranslate"><span class="pre">'UTF-8'</span></code>
|
||
in the UTF-8 Mode.</li>
|
||
<li>Version 3: The UTF-8 Mode does not change the <code class="docutils literal notranslate"><span class="pre">open()</span></code> default error
|
||
handler (<code class="docutils literal notranslate"><span class="pre">strict</span></code>) anymore, and the Strict UTF-8 Mode has been
|
||
removed.</li>
|
||
<li>Version 2: Rewrite the PEP from scratch to make it much shorter and
|
||
easier to understand.</li>
|
||
<li>Version 1: First version posted to python-dev.</li>
|
||
</ul>
|
||
</section>
|
||
<section id="copyright">
|
||
<h2><a class="toc-backref" href="#copyright" role="doc-backlink">Copyright</a></h2>
|
||
<p>This document has been placed in the public domain.</p>
|
||
</section>
|
||
</section>
|
||
<hr class="docutils" />
|
||
<p>Source: <a class="reference external" href="https://github.com/python/peps/blob/main/peps/pep-0540.rst">https://github.com/python/peps/blob/main/peps/pep-0540.rst</a></p>
|
||
<p>Last modified: <a class="reference external" href="https://github.com/python/peps/commits/main/peps/pep-0540.rst">2024-08-20 10:29:32 GMT</a></p>
|
||
|
||
</article>
|
||
<nav id="pep-sidebar">
|
||
<h2>Contents</h2>
|
||
<ul>
|
||
<li><a class="reference internal" href="#abstract">Abstract</a></li>
|
||
<li><a class="reference internal" href="#rationale">Rationale</a><ul>
|
||
<li><a class="reference internal" href="#locale-encoding-and-utf-8">Locale encoding and UTF-8</a></li>
|
||
<li><a class="reference internal" href="#passthrough-for-undecodable-bytes-surrogateescape">Passthrough for undecodable bytes: surrogateescape</a></li>
|
||
<li><a class="reference internal" href="#no-change-by-default-for-best-backward-compatibility">No change by default for best backward compatibility</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#proposal">Proposal</a></li>
|
||
<li><a class="reference internal" href="#relationship-with-the-locale-coercion-pep-538">Relationship with the locale coercion (PEP 538)</a></li>
|
||
<li><a class="reference internal" href="#backward-compatibility">Backward Compatibility</a></li>
|
||
<li><a class="reference internal" href="#annex-encodings-and-error-handlers">Annex: Encodings And Error Handlers</a><ul>
|
||
<li><a class="reference internal" href="#encoding-and-error-handler">Encoding and error handler</a></li>
|
||
<li><a class="reference internal" href="#encoding-and-error-handler-on-windows">Encoding and error handler on Windows</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#links">Links</a></li>
|
||
<li><a class="reference internal" href="#post-history">Post History</a></li>
|
||
<li><a class="reference internal" href="#version-history">Version History</a></li>
|
||
<li><a class="reference internal" href="#copyright">Copyright</a></li>
|
||
</ul>
|
||
|
||
<br>
|
||
<a id="source" href="https://github.com/python/peps/blob/main/peps/pep-0540.rst">Page Source (GitHub)</a>
|
||
</nav>
|
||
</section>
|
||
<script src="../_static/colour_scheme.js"></script>
|
||
<script src="../_static/wrap_tables.js"></script>
|
||
<script src="../_static/sticky_banner.js"></script>
|
||
</body>
|
||
</html> |