331 lines
22 KiB
HTML
331 lines
22 KiB
HTML
|
||
<!DOCTYPE html>
|
||
<html lang="en">
|
||
<head>
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||
<meta name="color-scheme" content="light dark">
|
||
<title>PEP 223 – Change the Meaning of \x Escapes | peps.python.org</title>
|
||
<link rel="shortcut icon" href="../_static/py.png">
|
||
<link rel="canonical" href="https://peps.python.org/pep-0223/">
|
||
<link rel="stylesheet" href="../_static/style.css" type="text/css">
|
||
<link rel="stylesheet" href="../_static/mq.css" type="text/css">
|
||
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" media="(prefers-color-scheme: light)" id="pyg-light">
|
||
<link rel="stylesheet" href="../_static/pygments_dark.css" type="text/css" media="(prefers-color-scheme: dark)" id="pyg-dark">
|
||
<link rel="alternate" type="application/rss+xml" title="Latest PEPs" href="https://peps.python.org/peps.rss">
|
||
<meta property="og:title" content='PEP 223 – Change the Meaning of \x Escapes | peps.python.org'>
|
||
<meta property="og:description" content="Change \x escapes, in both 8-bit and Unicode strings, to consume exactly the two hex digits following. The proposal views this as correcting an original design flaw, leading to clearer expression in all flavors of string, a cleaner Unicode story, bette...">
|
||
<meta property="og:type" content="website">
|
||
<meta property="og:url" content="https://peps.python.org/pep-0223/">
|
||
<meta property="og:site_name" content="Python Enhancement Proposals (PEPs)">
|
||
<meta property="og:image" content="https://peps.python.org/_static/og-image.png">
|
||
<meta property="og:image:alt" content="Python PEPs">
|
||
<meta property="og:image:width" content="200">
|
||
<meta property="og:image:height" content="200">
|
||
<meta name="description" content="Change \x escapes, in both 8-bit and Unicode strings, to consume exactly the two hex digits following. The proposal views this as correcting an original design flaw, leading to clearer expression in all flavors of string, a cleaner Unicode story, bette...">
|
||
<meta name="theme-color" content="#3776ab">
|
||
</head>
|
||
<body>
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
|
||
<symbol id="svg-sun-half" viewBox="0 0 24 24" pointer-events="all">
|
||
<title>Following system colour scheme</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
|
||
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||
<circle cx="12" cy="12" r="9"></circle>
|
||
<path d="M12 3v18m0-12l4.65-4.65M12 14.3l7.37-7.37M12 19.6l8.85-8.85"></path>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-moon" viewBox="0 0 24 24" pointer-events="all">
|
||
<title>Selected dark colour scheme</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
|
||
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||
<path stroke="none" d="M0 0h24v24H0z" fill="none"></path>
|
||
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z"></path>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-sun" viewBox="0 0 24 24" pointer-events="all">
|
||
<title>Selected light colour scheme</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
|
||
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||
<circle cx="12" cy="12" r="5"></circle>
|
||
<line x1="12" y1="1" x2="12" y2="3"></line>
|
||
<line x1="12" y1="21" x2="12" y2="23"></line>
|
||
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
|
||
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
|
||
<line x1="1" y1="12" x2="3" y2="12"></line>
|
||
<line x1="21" y1="12" x2="23" y2="12"></line>
|
||
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
|
||
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
|
||
</svg>
|
||
</symbol>
|
||
</svg>
|
||
<script>
|
||
|
||
document.documentElement.dataset.colour_scheme = localStorage.getItem("colour_scheme") || "auto"
|
||
</script>
|
||
<section id="pep-page-section">
|
||
<header>
|
||
<h1>Python Enhancement Proposals</h1>
|
||
<ul class="breadcrumbs">
|
||
<li><a href="https://www.python.org/" title="The Python Programming Language">Python</a> » </li>
|
||
<li><a href="../pep-0000/">PEP Index</a> » </li>
|
||
<li>PEP 223</li>
|
||
</ul>
|
||
<button id="colour-scheme-cycler" onClick="setColourScheme(nextColourScheme())">
|
||
<svg aria-hidden="true" class="colour-scheme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
|
||
<svg aria-hidden="true" class="colour-scheme-icon-when-dark"><use href="#svg-moon"></use></svg>
|
||
<svg aria-hidden="true" class="colour-scheme-icon-when-light"><use href="#svg-sun"></use></svg>
|
||
<span class="visually-hidden">Toggle light / dark / auto colour theme</span>
|
||
</button>
|
||
</header>
|
||
<article>
|
||
<section id="pep-content">
|
||
<h1 class="page-title">PEP 223 – Change the Meaning of <code class="docutils literal notranslate"><span class="pre">\x</span></code> Escapes</h1>
|
||
<dl class="rfc2822 field-list simple">
|
||
<dt class="field-odd">Author<span class="colon">:</span></dt>
|
||
<dd class="field-odd">Tim Peters &lt;tim.peters at gmail.com&gt;</dd>
|
||
<dt class="field-even">Status<span class="colon">:</span></dt>
|
||
<dd class="field-even"><abbr title="Accepted and implementation complete, or no longer active">Final</abbr></dd>
|
||
<dt class="field-odd">Type<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><abbr title="Normative PEP with a new feature for Python, implementation change for CPython or interoperability standard for the ecosystem">Standards Track</abbr></dd>
|
||
<dt class="field-even">Created<span class="colon">:</span></dt>
|
||
<dd class="field-even">20-Aug-2000</dd>
|
||
<dt class="field-odd">Python-Version<span class="colon">:</span></dt>
|
||
<dd class="field-odd">2.0</dd>
|
||
<dt class="field-even">Post-History<span class="colon">:</span></dt>
|
||
<dd class="field-even">23-Aug-2000</dd>
|
||
</dl>
|
||
<hr class="docutils" />
|
||
<section id="contents">
|
||
<details><summary>Table of Contents</summary><ul class="simple">
|
||
<li><a class="reference internal" href="#abstract">Abstract</a></li>
|
||
<li><a class="reference internal" href="#syntax">Syntax</a></li>
|
||
<li><a class="reference internal" href="#semantics">Semantics</a></li>
|
||
<li><a class="reference internal" href="#example">Example</a></li>
|
||
<li><a class="reference internal" href="#history-and-rationale">History and Rationale</a></li>
|
||
<li><a class="reference internal" href="#development-and-discussion">Development and Discussion</a></li>
|
||
<li><a class="reference internal" href="#backward-compatibility">Backward Compatibility</a></li>
|
||
<li><a class="reference internal" href="#effects-on-other-tools">Effects on Other Tools</a></li>
|
||
<li><a class="reference internal" href="#reference-implementation">Reference Implementation</a></li>
|
||
<li><a class="reference internal" href="#bdfl-pronouncements">BDFL Pronouncements</a></li>
|
||
<li><a class="reference internal" href="#references">References</a></li>
|
||
<li><a class="reference internal" href="#copyright">Copyright</a></li>
|
||
</ul>
|
||
</details></section>
|
||
<section id="abstract">
|
||
<h2><a class="toc-backref" href="#abstract" role="doc-backlink">Abstract</a></h2>
|
||
<p>Change <code class="docutils literal notranslate"><span class="pre">\x</span></code> escapes, in both 8-bit and Unicode strings, to consume
|
||
exactly the two hex digits following. The proposal views this as
|
||
correcting an original design flaw, leading to clearer expression
|
||
in all flavors of string, a cleaner Unicode story, better
|
||
compatibility with Perl regular expressions, and with minimal risk
|
||
to existing code.</p>
|
||
</section>
|
||
<section id="syntax">
|
||
<h2><a class="toc-backref" href="#syntax" role="doc-backlink">Syntax</a></h2>
|
||
<p>The syntax of <code class="docutils literal notranslate"><span class="pre">\x</span></code> escapes, in all flavors of non-raw strings, becomes</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>\<span class="n">xhh</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>where h is a hex digit (0-9, a-f, A-F). The exact syntax in 1.5.2 is
|
||
not clearly specified in the Reference Manual; it says</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>\<span class="n">xhh</span><span class="o">...</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>implying “two or more” hex digits, but one-digit forms are also
|
||
accepted by the 1.5.2 compiler, and a plain <code class="docutils literal notranslate"><span class="pre">\x</span></code> is “expanded” to
|
||
itself (i.e., a backslash followed by the letter x). It’s unclear
|
||
whether the Reference Manual intended either of the 1-digit or
|
||
0-digit behaviors.</p>
|
||
</section>
|
||
<section id="semantics">
|
||
<h2><a class="toc-backref" href="#semantics" role="doc-backlink">Semantics</a></h2>
|
||
<p>In an 8-bit non-raw string,</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>\<span class="n">xij</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>expands to the character</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nb">chr</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">ij</span><span class="p">,</span> <span class="mi">16</span><span class="p">))</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Note that this is the same as in 1.6 and before.</p>
|
||
<p>In a Unicode string,</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>\<span class="n">xij</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>acts the same as</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>\<span class="n">u00ij</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>i.e. it expands to the obvious Latin-1 character from the initial
|
||
segment of the Unicode space.</p>
|
||
<p>An <code class="docutils literal notranslate"><span class="pre">\x</span></code> not followed by at least two hex digits is a compile-time error,
|
||
specifically <code class="docutils literal notranslate"><span class="pre">ValueError</span></code> in 8-bit strings, and <code class="docutils literal notranslate"><span class="pre">UnicodeError</span></code> (a subclass
|
||
of <code class="docutils literal notranslate"><span class="pre">ValueError</span></code>) in Unicode strings. Note that if an <code class="docutils literal notranslate"><span class="pre">\x</span></code> is followed by
|
||
more than two hex digits, only the first two are “consumed”. In 1.6
|
||
and before all but the <em>last</em> two were silently ignored.</p>
|
||
</section>
|
||
<section id="example">
|
||
<h2><a class="toc-backref" href="#example" role="doc-backlink">Example</a></h2>
|
||
<p>In 1.5.2:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="s2">"</span><span class="se">\x12</span><span class="s2">3465"</span> <span class="c1"># same as "\x65"</span>
|
||
<span class="go">'e'</span>
|
||
<span class="gp">>>> </span><span class="s2">"</span><span class="se">\x65</span><span class="s2">"</span>
|
||
<span class="go">'e'</span>
|
||
<span class="gp">>>> </span><span class="s2">"\x1"</span>
|
||
<span class="go">'\001'</span>
|
||
<span class="gp">>>> </span><span class="s2">"\x\x"</span>
|
||
<span class="go">'\\x\\x'</span>
|
||
<span class="gp">>>></span>
|
||
</pre></div>
|
||
</div>
|
||
<p>In 2.0:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="s2">"</span><span class="se">\x12</span><span class="s2">3465"</span> <span class="c1"># \x12 -> \022, "3465" left alone</span>
|
||
<span class="go">'\0223465'</span>
|
||
<span class="gp">>>> </span><span class="s2">"</span><span class="se">\x65</span><span class="s2">"</span>
|
||
<span class="go">'e'</span>
|
||
<span class="gp">>>> </span><span class="s2">"\x1"</span>
|
||
<span class="go">[ValueError is raised]</span>
|
||
<span class="gp">>>> </span><span class="s2">"\x\x"</span>
|
||
<span class="go">[ValueError is raised]</span>
|
||
<span class="gp">>>></span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="history-and-rationale">
|
||
<h2><a class="toc-backref" href="#history-and-rationale" role="doc-backlink">History and Rationale</a></h2>
|
||
<p><code class="docutils literal notranslate"><span class="pre">\x</span></code> escapes were introduced in C as a way to specify variable-width
|
||
character encodings. Exactly which encodings those were, and how many
|
||
hex digits they required, was left up to each implementation. The
|
||
language simply stated that <code class="docutils literal notranslate"><span class="pre">\x</span></code> “consumed” <em>all</em> hex digits following,
|
||
and left the meaning up to each implementation. So, in effect, <code class="docutils literal notranslate"><span class="pre">\x</span></code> in C
|
||
is a standard hook to supply platform-defined behavior.</p>
|
||
<p>Because Python explicitly aims at platform independence, the <code class="docutils literal notranslate"><span class="pre">\x</span></code> escape
|
||
in Python (up to and including 1.6) has been treated the same way
|
||
across all platforms: all <em>except</em> the last two hex digits were
|
||
silently ignored. So the only actual use for <code class="docutils literal notranslate"><span class="pre">\x</span></code> escapes in Python was
|
||
to specify a single byte using hex notation.</p>
|
||
<p>Larry Wall appears to have realized that this was the only real use for
|
||
<code class="docutils literal notranslate"><span class="pre">\x</span></code> escapes in a platform-independent language, as the proposed rule for
|
||
Python 2.0 is in fact what Perl has done from the start (although you
|
||
need to run in Perl -w mode to get warned about <code class="docutils literal notranslate"><span class="pre">\x</span></code> escapes with fewer
|
||
than 2 hex digits following – it’s clearly more Pythonic to insist on
|
||
2 all the time).</p>
|
||
<p>When Unicode strings were introduced to Python, <code class="docutils literal notranslate"><span class="pre">\x</span></code> was generalized so
|
||
as to ignore all but the last <em>four</em> hex digits in Unicode strings.
|
||
This caused a technical difficulty for the new regular expression engine:
|
||
SRE tries very hard to allow mixing 8-bit and Unicode patterns and
|
||
strings in intuitive ways, and it no longer had any way to guess what,
|
||
for example, <code class="docutils literal notranslate"><span class="pre">r"\x123456"</span></code> should mean as a pattern: is it asking to match
|
||
the 8-bit character <code class="docutils literal notranslate"><span class="pre">\x56</span></code> or the Unicode character <code class="docutils literal notranslate"><span class="pre">\u3456</span></code>?</p>
|
||
<p>There are hacky ways to guess, but it doesn’t end there. The ISO C99
|
||
standard also introduces 8-digit <code class="docutils literal notranslate"><span class="pre">\U12345678</span></code> escapes to cover the entire
|
||
ISO 10646 character space, and it’s also desired that Python 2 support
|
||
that from the start. But then what are <code class="docutils literal notranslate"><span class="pre">\x</span></code> escapes supposed to mean?
|
||
Do they ignore all but the last <em>eight</em> hex digits then? And if less
|
||
than 8 following in a Unicode string, all but the last 4? And if less
|
||
than 4, all but the last 2?</p>
|
||
<p>This was getting messier by the minute, and the proposal cuts the
|
||
Gordian knot by making <code class="docutils literal notranslate"><span class="pre">\x</span></code> simpler instead of more complicated. Note
|
||
that the 4-digit generalization to <code class="docutils literal notranslate"><span class="pre">\xijkl</span></code> in Unicode strings was also
|
||
redundant, because it meant exactly the same thing as <code class="docutils literal notranslate"><span class="pre">\uijkl</span></code> in Unicode
|
||
strings. It’s more Pythonic to have just one obvious way to specify a
|
||
Unicode character via hex notation.</p>
|
||
</section>
|
||
<section id="development-and-discussion">
|
||
<h2><a class="toc-backref" href="#development-and-discussion" role="doc-backlink">Development and Discussion</a></h2>
|
||
<p>The proposal was worked out among Guido van Rossum, Fredrik Lundh and
|
||
Tim Peters in email. It was subsequently explained and discussed on
|
||
Python-Dev under subject “Go x yourself” <a class="footnote-reference brackets" href="#id2" id="id1">[1]</a>, starting 2000-08-03.
|
||
Response was overwhelmingly positive; no objections were raised.</p>
|
||
</section>
|
||
<section id="backward-compatibility">
|
||
<h2><a class="toc-backref" href="#backward-compatibility" role="doc-backlink">Backward Compatibility</a></h2>
|
||
<p>Changing the meaning of <code class="docutils literal notranslate"><span class="pre">\x</span></code> escapes does carry risk of breaking existing
|
||
code, although no instances of incompatibility have yet been discovered.
|
||
The risk is believed to be minimal.</p>
|
||
<p>Tim Peters verified that, except for pieces of the standard test suite
|
||
deliberately provoking end cases, there are no instances of <code class="docutils literal notranslate"><span class="pre">\xabcdef...</span></code>
|
||
with fewer or more than 2 hex digits following, in either the Python
|
||
CVS development tree, or in assorted Python packages sitting on his
|
||
machine.</p>
|
||
<p>It’s unlikely there are any with fewer than 2, because the Reference
|
||
Manual implied they weren’t legal (although this is debatable!). If
|
||
there are any with more than 2, Guido is ready to argue they were buggy
|
||
anyway &lt;0.9 wink&gt;.</p>
|
||
<p>Guido reported that the O’Reilly Python books <em>already</em> document that
|
||
Python works the proposed way, likely due to their Perl editing
|
||
heritage (as above, Perl worked (very close to) the proposed way from
|
||
its start).</p>
|
||
<p>Finn Bock reported that what JPython does with <code class="docutils literal notranslate"><span class="pre">\x</span></code> escapes is
|
||
unpredictable today. This proposal gives a clear meaning that can be
|
||
consistently and easily implemented across all Python implementations.</p>
|
||
</section>
|
||
<section id="effects-on-other-tools">
|
||
<h2><a class="toc-backref" href="#effects-on-other-tools" role="doc-backlink">Effects on Other Tools</a></h2>
|
||
<p>Believed to be none. The candidates for breakage would mostly be
|
||
parsing tools, but the author knows of none that worry about the
|
||
internal structure of Python strings beyond the approximation “when
|
||
there’s a backslash, swallow the next character”. Tim Peters checked
|
||
<code class="docutils literal notranslate"><span class="pre">python-mode.el</span></code>, the std <code class="docutils literal notranslate"><span class="pre">tokenize.py</span></code> and <code class="docutils literal notranslate"><span class="pre">pyclbr.py</span></code>, and the IDLE syntax
|
||
coloring subsystem, and believes there’s no need to change any of
|
||
them. Tools like <code class="docutils literal notranslate"><span class="pre">tabnanny.py</span></code> and <code class="docutils literal notranslate"><span class="pre">checkappend.py</span></code> inherit their immunity
|
||
from <code class="docutils literal notranslate"><span class="pre">tokenize.py</span></code>.</p>
|
||
</section>
|
||
<section id="reference-implementation">
|
||
<h2><a class="toc-backref" href="#reference-implementation" role="doc-backlink">Reference Implementation</a></h2>
|
||
<p>The code changes are so simple that a separate patch will not be produced.
|
||
Fredrik Lundh is writing the code, is an expert in the area, and will
|
||
simply check the changes in before 2.0b1 is released.</p>
|
||
</section>
|
||
<section id="bdfl-pronouncements">
|
||
<h2><a class="toc-backref" href="#bdfl-pronouncements" role="doc-backlink">BDFL Pronouncements</a></h2>
|
||
<p>Yes, <code class="docutils literal notranslate"><span class="pre">ValueError</span></code>, not <code class="docutils literal notranslate"><span class="pre">SyntaxError</span></code>. “Problems with literal interpretations
|
||
traditionally raise ‘runtime’ exceptions rather than syntax errors.”</p>
|
||
</section>
|
||
<section id="references">
|
||
<h2><a class="toc-backref" href="#references" role="doc-backlink">References</a></h2>
|
||
<aside class="footnote-list brackets">
|
||
<aside class="footnote brackets" id="id2" role="doc-footnote">
|
||
<dt class="label">[<a href="#id1">1</a>]</dt>
|
||
<dd>Tim Peters, Go x yourself
|
||
<a class="reference external" href="https://mail.python.org/pipermail/python-dev/2000-August/007825.html">https://mail.python.org/pipermail/python-dev/2000-August/007825.html</a></aside>
|
||
</aside>
|
||
</section>
|
||
<section id="copyright">
|
||
<h2><a class="toc-backref" href="#copyright" role="doc-backlink">Copyright</a></h2>
|
||
<p>This document has been placed in the public domain.</p>
|
||
</section>
|
||
</section>
|
||
<hr class="docutils" />
|
||
<p>Source: <a class="reference external" href="https://github.com/python/peps/blob/main/peps/pep-0223.rst">https://github.com/python/peps/blob/main/peps/pep-0223.rst</a></p>
|
||
<p>Last modified: <a class="reference external" href="https://github.com/python/peps/commits/main/peps/pep-0223.rst">2023-09-09 17:39:29 GMT</a></p>
|
||
|
||
</article>
|
||
<nav id="pep-sidebar">
|
||
<h2>Contents</h2>
|
||
<ul>
|
||
<li><a class="reference internal" href="#abstract">Abstract</a></li>
|
||
<li><a class="reference internal" href="#syntax">Syntax</a></li>
|
||
<li><a class="reference internal" href="#semantics">Semantics</a></li>
|
||
<li><a class="reference internal" href="#example">Example</a></li>
|
||
<li><a class="reference internal" href="#history-and-rationale">History and Rationale</a></li>
|
||
<li><a class="reference internal" href="#development-and-discussion">Development and Discussion</a></li>
|
||
<li><a class="reference internal" href="#backward-compatibility">Backward Compatibility</a></li>
|
||
<li><a class="reference internal" href="#effects-on-other-tools">Effects on Other Tools</a></li>
|
||
<li><a class="reference internal" href="#reference-implementation">Reference Implementation</a></li>
|
||
<li><a class="reference internal" href="#bdfl-pronouncements">BDFL Pronouncements</a></li>
|
||
<li><a class="reference internal" href="#references">References</a></li>
|
||
<li><a class="reference internal" href="#copyright">Copyright</a></li>
|
||
</ul>
|
||
|
||
<br>
|
||
<a id="source" href="https://github.com/python/peps/blob/main/peps/pep-0223.rst">Page Source (GitHub)</a>
|
||
</nav>
|
||
</section>
|
||
<script src="../_static/colour_scheme.js"></script>
|
||
<script src="../_static/wrap_tables.js"></script>
|
||
<script src="../_static/sticky_banner.js"></script>
|
||
</body>
|
||
</html> |