904 lines
82 KiB
HTML
904 lines
82 KiB
HTML
|
||
<!DOCTYPE html>
|
||
<html lang="en">
|
||
<head>
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||
<meta name="color-scheme" content="light dark">
|
||
<title>PEP 617 – New PEG parser for CPython | peps.python.org</title>
|
||
<link rel="shortcut icon" href="../_static/py.png">
|
||
<link rel="canonical" href="https://peps.python.org/pep-0617/">
|
||
<link rel="stylesheet" href="../_static/style.css" type="text/css">
|
||
<link rel="stylesheet" href="../_static/mq.css" type="text/css">
|
||
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" media="(prefers-color-scheme: light)" id="pyg-light">
|
||
<link rel="stylesheet" href="../_static/pygments_dark.css" type="text/css" media="(prefers-color-scheme: dark)" id="pyg-dark">
|
||
<link rel="alternate" type="application/rss+xml" title="Latest PEPs" href="https://peps.python.org/peps.rss">
|
||
<meta property="og:title" content='PEP 617 – New PEG parser for CPython | peps.python.org'>
|
||
<meta property="og:description" content="Python Enhancement Proposals (PEPs)">
|
||
<meta property="og:type" content="website">
|
||
<meta property="og:url" content="https://peps.python.org/pep-0617/">
|
||
<meta property="og:site_name" content="Python Enhancement Proposals (PEPs)">
|
||
<meta property="og:image" content="https://peps.python.org/_static/og-image.png">
|
||
<meta property="og:image:alt" content="Python PEPs">
|
||
<meta property="og:image:width" content="200">
|
||
<meta property="og:image:height" content="200">
|
||
<meta name="description" content="Python Enhancement Proposals (PEPs)">
|
||
<meta name="theme-color" content="#3776ab">
|
||
</head>
|
||
<body>
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
|
||
<symbol id="svg-sun-half" viewBox="0 0 24 24" pointer-events="all">
|
||
<title>Following system colour scheme</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
|
||
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||
<circle cx="12" cy="12" r="9"></circle>
|
||
<path d="M12 3v18m0-12l4.65-4.65M12 14.3l7.37-7.37M12 19.6l8.85-8.85"></path>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-moon" viewBox="0 0 24 24" pointer-events="all">
|
||
<title>Selected dark colour scheme</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
|
||
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||
<path stroke="none" d="M0 0h24v24H0z" fill="none"></path>
|
||
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z"></path>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-sun" viewBox="0 0 24 24" pointer-events="all">
|
||
<title>Selected light colour scheme</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
|
||
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||
<circle cx="12" cy="12" r="5"></circle>
|
||
<line x1="12" y1="1" x2="12" y2="3"></line>
|
||
<line x1="12" y1="21" x2="12" y2="23"></line>
|
||
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
|
||
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
|
||
<line x1="1" y1="12" x2="3" y2="12"></line>
|
||
<line x1="21" y1="12" x2="23" y2="12"></line>
|
||
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
|
||
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
|
||
</svg>
|
||
</symbol>
|
||
</svg>
|
||
<script>
|
||
|
||
document.documentElement.dataset.colour_scheme = localStorage.getItem("colour_scheme") || "auto"
|
||
</script>
|
||
<section id="pep-page-section">
|
||
<header>
|
||
<h1>Python Enhancement Proposals</h1>
|
||
<ul class="breadcrumbs">
|
||
<li><a href="https://www.python.org/" title="The Python Programming Language">Python</a> » </li>
|
||
<li><a href="../pep-0000/">PEP Index</a> » </li>
|
||
<li>PEP 617</li>
|
||
</ul>
|
||
<button id="colour-scheme-cycler" onClick="setColourScheme(nextColourScheme())">
|
||
<svg aria-hidden="true" class="colour-scheme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
|
||
<svg aria-hidden="true" class="colour-scheme-icon-when-dark"><use href="#svg-moon"></use></svg>
|
||
<svg aria-hidden="true" class="colour-scheme-icon-when-light"><use href="#svg-sun"></use></svg>
|
||
<span class="visually-hidden">Toggle light / dark / auto colour theme</span>
|
||
</button>
|
||
</header>
|
||
<article>
|
||
<section id="pep-content">
|
||
<h1 class="page-title">PEP 617 – New PEG parser for CPython</h1>
|
||
<dl class="rfc2822 field-list simple">
|
||
<dt class="field-odd">Author<span class="colon">:</span></dt>
|
||
<dd class="field-odd">Guido van Rossum <guido at python.org>,
|
||
Pablo Galindo <pablogsal at python.org>,
|
||
Lysandros Nikolaou <lisandrosnik at gmail.com></dd>
|
||
<dt class="field-even">Discussions-To<span class="colon">:</span></dt>
|
||
<dd class="field-even"><a class="reference external" href="https://mail.python.org/archives/list/python-dev@python.org/">Python-Dev list</a></dd>
|
||
<dt class="field-odd">Status<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><abbr title="Accepted and implementation complete, or no longer active">Final</abbr></dd>
|
||
<dt class="field-even">Type<span class="colon">:</span></dt>
|
||
<dd class="field-even"><abbr title="Normative PEP with a new feature for Python, implementation change for CPython or interoperability standard for the ecosystem">Standards Track</abbr></dd>
|
||
<dt class="field-odd">Created<span class="colon">:</span></dt>
|
||
<dd class="field-odd">24-Mar-2020</dd>
|
||
<dt class="field-even">Python-Version<span class="colon">:</span></dt>
|
||
<dd class="field-even">3.9</dd>
|
||
<dt class="field-odd">Post-History<span class="colon">:</span></dt>
|
||
<dd class="field-odd">02-Apr-2020</dd>
|
||
</dl>
|
||
<hr class="docutils" />
|
||
<section id="contents">
|
||
<details><summary>Table of Contents</summary><ul class="simple">
|
||
<li><a class="reference internal" href="#overview">Overview</a></li>
|
||
<li><a class="reference internal" href="#background-on-ll-1-parsers">Background on LL(1) parsers</a></li>
|
||
<li><a class="reference internal" href="#background-on-peg-parsers">Background on PEG parsers</a></li>
|
||
<li><a class="reference internal" href="#rationale">Rationale</a><ul>
|
||
<li><a class="reference internal" href="#some-rules-are-not-actually-ll-1">Some rules are not actually LL(1)</a></li>
|
||
<li><a class="reference internal" href="#complicated-ast-parsing">Complicated AST parsing</a></li>
|
||
<li><a class="reference internal" href="#lack-of-left-recursion">Lack of left recursion</a></li>
|
||
<li><a class="reference internal" href="#intermediate-parse-tree">Intermediate parse tree</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#the-new-proposed-peg-parser">The new proposed PEG parser</a><ul>
|
||
<li><a class="reference internal" href="#left-recursion">Left recursion</a></li>
|
||
<li><a class="reference internal" href="#syntax">Syntax</a><ul>
|
||
<li><a class="reference internal" href="#grammar-expressions">Grammar Expressions</a><ul>
|
||
<li><a class="reference internal" href="#comment"><code class="docutils literal notranslate"><span class="pre">#</span> <span class="pre">comment</span></code></a></li>
|
||
<li><a class="reference internal" href="#e1-e2"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">e2</span></code></a></li>
|
||
<li><a class="reference internal" href="#e1-e2-1"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">|</span> <span class="pre">e2</span></code></a></li>
|
||
<li><a class="reference internal" href="#e"><code class="docutils literal notranslate"><span class="pre">(</span> <span class="pre">e</span> <span class="pre">)</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-or-e"><code class="docutils literal notranslate"><span class="pre">[</span> <span class="pre">e</span> <span class="pre">]</span> <span class="pre">or</span> <span class="pre">e?</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-1"><code class="docutils literal notranslate"><span class="pre">e*</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-2"><code class="docutils literal notranslate"><span class="pre">e+</span></code></a></li>
|
||
<li><a class="reference internal" href="#s-e"><code class="docutils literal notranslate"><span class="pre">s.e+</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-3"><code class="docutils literal notranslate"><span class="pre">&e</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-4"><code class="docutils literal notranslate"><span class="pre">!e</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-5"><code class="docutils literal notranslate"><span class="pre">~</span></code></a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#variables-in-the-grammar">Variables in the Grammar</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#grammar-actions">Grammar actions</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#migration-plan">Migration plan</a></li>
|
||
<li><a class="reference internal" href="#performance-and-validation">Performance and validation</a><ul>
|
||
<li><a class="reference internal" href="#validation">Validation</a></li>
|
||
<li><a class="reference internal" href="#performance">Performance</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#rejected-alternatives">Rejected Alternatives</a></li>
|
||
<li><a class="reference internal" href="#references">References</a></li>
|
||
<li><a class="reference internal" href="#copyright">Copyright</a></li>
|
||
</ul>
|
||
</details></section>
|
||
<div class="pep-banner canonical-doc sticky-banner admonition important">
|
||
<p class="admonition-title">Important</p>
|
||
<p>This PEP is a historical document. The up-to-date, canonical documentation can now be found at <a class="reference external" href="https://docs.python.org/3/reference/grammar.html#full-grammar-specification" title="(in Python v3.13)"><span>Full Grammar specification</span></a>.</p>
|
||
<p class="close-button">×</p>
|
||
<p>See <a class="pep reference internal" href="../pep-0001/" title="PEP 1 – PEP Purpose and Guidelines">PEP 1</a> for how to propose changes.</p>
|
||
</div>
|
||
<section id="overview">
|
||
<h2><a class="toc-backref" href="#overview" role="doc-backlink">Overview</a></h2>
|
||
<p>This PEP proposes replacing the current LL(1)-based parser of CPython
|
||
with a new PEG-based parser. This new parser would allow the elimination of multiple
|
||
“hacks” that exist in the current grammar to circumvent the LL(1)-limitation.
|
||
It would substantially reduce the maintenance costs in some areas related to the
|
||
compiling pipeline such as the grammar, the parser and the AST generation. The new PEG
|
||
parser will also lift the LL(1) restriction on the current Python grammar.</p>
|
||
</section>
|
||
<section id="background-on-ll-1-parsers">
|
||
<h2><a class="toc-backref" href="#background-on-ll-1-parsers" role="doc-backlink">Background on LL(1) parsers</a></h2>
|
||
<p>The current Python grammar is an LL(1)-based grammar. A grammar can be said to be
|
||
LL(1) if it can be parsed by an LL(1) parser, which in turn is defined as a
|
||
top-down parser that parses the input from left to right, performing leftmost
|
||
derivation of the sentence, with just one token of lookahead.
|
||
The traditional approach to constructing or generating an LL(1) parser is to
|
||
produce a <em>parse table</em> which encodes the possible transitions between all possible
|
||
states of the parser. These tables are normally constructed from the <em>first sets</em>
|
||
and the <em>follow sets</em> of the grammar:</p>
|
||
<ul>
|
||
<li>Given a rule, the <em>first set</em> is the collection of all terminals that can occur
|
||
first in a full derivation of that rule. Intuitively, this helps the parser decide
|
||
among the alternatives in a rule. For
|
||
instance, given the rule:<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">A</span> <span class="o">|</span> <span class="nc">B</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>if only <code class="docutils literal notranslate"><span class="pre">A</span></code> can start with the terminal <em>a</em> and only <code class="docutils literal notranslate"><span class="pre">B</span></code> can start with the
|
||
terminal <em>b</em> and the parser sees the token <em>b</em> when parsing this rule, it knows
|
||
that it needs to follow the non-terminal <code class="docutils literal notranslate"><span class="pre">B</span></code>.</p>
|
||
</li>
|
||
<li>An extension to this simple idea is needed when a rule may expand to the empty string.
|
||
Given a rule, the <em>follow set</em> is the collection of terminals that can appear
|
||
immediately to the right of that rule in a partial derivation. Intuitively, this
|
||
solves the problem of the empty alternative. For instance,
|
||
given this rule:<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">A</span> <span class="s1">'b'</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>if the parser has the token <em>b</em> and the non-terminal <code class="docutils literal notranslate"><span class="pre">A</span></code> can only start
|
||
with the token <em>a</em>, then the parser can tell that this is an invalid program.
|
||
But if <code class="docutils literal notranslate"><span class="pre">A</span></code> could expand to the empty string (called an ε-production),
|
||
then the parser would recognise a valid empty <code class="docutils literal notranslate"><span class="pre">A</span></code>,
|
||
since the next token <em>b</em> is in the <em>follow set</em> of <code class="docutils literal notranslate"><span class="pre">A</span></code>.</p>
|
||
</li>
|
||
</ul>
|
||
<p>The current Python grammar does not contain ε-productions, so the <em>follow sets</em> are not
|
||
needed when creating the parse tables. Currently, in CPython, a parser generator
|
||
program reads the grammar and produces a parsing table representing a set of
|
||
deterministic finite automata (DFA) that can be included in a C program, the
|
||
parser. The parser is a pushdown automaton that uses this data to produce a Concrete
|
||
Syntax Tree (CST) sometimes known directly as a “parse tree”. In this process, the
|
||
<em>first sets</em> are used indirectly when generating the DFAs.</p>
|
||
<p>LL(1) parsers and grammars are usually efficient and simple to implement
|
||
and generate. However, it is not possible, under the LL(1) restriction,
|
||
to express certain common constructs in a way natural to the language
|
||
designer and the reader. This includes some in the Python language.</p>
|
||
<p>As LL(1) parsers can only look one token ahead to distinguish
|
||
possibilities, some rules in the grammar may be ambiguous. For instance the rule:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">A</span> <span class="o">|</span> <span class="nc">B</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>is ambiguous if the <em>first sets</em> of both <code class="docutils literal notranslate"><span class="pre">A</span></code> and <code class="docutils literal notranslate"><span class="pre">B</span></code> have some elements in
|
||
common. When the parser sees a token in the input
|
||
program that both <em>A</em> and <em>B</em> can start with, it is impossible for it to deduce
|
||
which option to expand, as no further token of the program can be examined to
|
||
disambiguate.
|
||
The rule may be transformed to equivalent LL(1) rules, but then it may
|
||
be harder for a human reader to grasp its meaning.
|
||
Examples later in this document show that the current LL(1)-based
|
||
grammar suffers a lot from this scenario.</p>
|
||
<p>Another broad class of rules precluded by LL(1) is left-recursive rules.
|
||
A rule is left-recursive if it can derive to a
|
||
sentential form with itself as the leftmost symbol. For instance this rule:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">rule</span> <span class="s1">'a'</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>is left-recursive because the rule can be expanded to an expression that starts
|
||
with itself. As will be described later, left-recursion is the natural way to
|
||
express certain desired language properties directly in the grammar.</p>
|
||
</section>
|
||
<section id="background-on-peg-parsers">
|
||
<h2><a class="toc-backref" href="#background-on-peg-parsers" role="doc-backlink">Background on PEG parsers</a></h2>
|
||
<p>A PEG (Parsing Expression Grammar) grammar differs from a context-free grammar
|
||
(like the current one) in the fact that the way it is written more closely
|
||
reflects how the parser will operate when parsing it. The fundamental technical
|
||
difference is that the choice operator is ordered. This means that when writing:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">A</span> <span class="o">|</span> <span class="nc">B</span> <span class="o">|</span> <span class="nc">C</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>a context-free-grammar parser (like an LL(1) parser) will generate constructions
|
||
that given an input string will <em>deduce</em> which alternative (<code class="docutils literal notranslate"><span class="pre">A</span></code>, <code class="docutils literal notranslate"><span class="pre">B</span></code> or <code class="docutils literal notranslate"><span class="pre">C</span></code>)
|
||
must be expanded, while a PEG parser will check if the first alternative succeeds
|
||
and only if it fails, will it continue with the second or the third one in the
|
||
order in which they are written. This makes the choice operator not commutative.</p>
|
||
<p>Unlike LL(1) parsers, PEG-based parsers cannot be ambiguous: if a string parses,
|
||
it has exactly one valid parse tree. This means that a PEG-based parser cannot
|
||
suffer from the ambiguity problems described in the previous section.</p>
|
||
<p>PEG parsers are usually constructed as a recursive descent parser in which every
|
||
rule in the grammar corresponds to a function in the program implementing the
|
||
parser and the parsing expression (the “expansion” or “definition” of the rule)
|
||
represents the “code” in said function. Each parsing function conceptually takes
|
||
an input string as its argument, and yields one of the following results:</p>
|
||
<ul class="simple">
|
||
<li>A “success” result. This result indicates that the expression can be parsed by
|
||
that rule and the function may optionally move forward or consume one or more
|
||
characters of the input string supplied to it.</li>
|
||
<li>A “failure” result, in which case no input is consumed.</li>
|
||
</ul>
|
||
<p>Notice that “failure” results do not imply that the program is incorrect or a
|
||
parsing failure because as the choice operator is ordered, a “failure” result
|
||
merely indicates “try the following option”. A direct implementation of a PEG
|
||
parser as a recursive descent parser will present exponential time performance in
|
||
the worst case as compared with LL(1) parsers, because PEG parsers have infinite lookahead
|
||
(this means that they can consider an arbitrary number of tokens before deciding
|
||
for a rule). Usually, PEG parsers avoid this exponential time complexity with a
|
||
technique called “packrat parsing” <a class="footnote-reference brackets" href="#id10" id="id1">[1]</a> which not only loads the entire
|
||
program in memory before parsing it but also allows the parser to backtrack
|
||
arbitrarily. This is made efficient by memoizing the rules already matched for
|
||
each position. The cost of the memoization cache is that the parser will naturally
|
||
use more memory than a simple LL(1) parser, which normally are table-based. We
|
||
will explain later in this document why we consider this cost acceptable.</p>
|
||
</section>
|
||
<section id="rationale">
|
||
<h2><a class="toc-backref" href="#rationale" role="doc-backlink">Rationale</a></h2>
|
||
<p>In this section, we describe a list of problems that are present in the current parser
|
||
machinery in CPython that motivates the need for a new parser.</p>
|
||
<section id="some-rules-are-not-actually-ll-1">
|
||
<h3><a class="toc-backref" href="#some-rules-are-not-actually-ll-1" role="doc-backlink">Some rules are not actually LL(1)</a></h3>
|
||
<p>Although the Python grammar is technically an LL(1) grammar (because it is parsed by
|
||
an LL(1) parser) several rules are not LL(1) and several workarounds are
|
||
implemented in the grammar and in other parts of CPython to deal with this. For
|
||
example, consider the rule for assignment expressions:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">namedexpr_test</span><span class="o">:</span> <span class="p">[</span><span class="s">NAME ':='</span><span class="p">]</span> <span class="nc">test</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>This simple rule is not compatible with the Python grammar as <em>NAME</em> is among the
|
||
elements of the <em>first set</em> of the rule <em>test</em>. To work around this limitation the
|
||
actual rule that appears in the current grammar is:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">namedexpr_test</span><span class="o">:</span> <span class="nc">test</span> <span class="p">[</span><span class="s">':=' test</span><span class="p">]</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Which is a much broader rule than the previous one allowing constructs like <code class="docutils literal notranslate"><span class="pre">[x</span>
|
||
<span class="pre">for</span> <span class="pre">x</span> <span class="pre">in</span> <span class="pre">y]</span> <span class="pre">:=</span> <span class="pre">[1,2,3]</span></code>. The way the rule is limited to its desired form is by
|
||
disallowing these unwanted constructions when transforming the parse tree to the
|
||
abstract syntax tree. This is not only inelegant but a considerable maintenance
|
||
burden as it forces the AST creation routines and the compiler into a situation in
|
||
which they need to know how to separate valid programs from invalid programs,
|
||
which should be a responsibility solely of the parser. This also leads to the
|
||
actual grammar file not reflecting correctly what the <em>actual</em> grammar is (that
|
||
is, the collection of all valid Python programs).</p>
|
||
<p>Similar workarounds appear in multiple other rules of the current grammar.
|
||
Sometimes this problem is unsolvable. For instance, <a class="reference external" href="https://github.com/python/cpython/issues/56991">bpo-12782: Multiple context
|
||
expressions do not support parentheses for continuation across lines</a> shows how making an LL(1) rule that supports
|
||
writing:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">with</span> <span class="p">(</span>
|
||
<span class="nc">open</span><span class="p">(</span><span class="s2">"a_really_long_foo"</span><span class="p">)</span> <span class="nc">as</span> <span class="nc">foo,</span>
|
||
<span class="nc">open</span><span class="p">(</span><span class="s2">"a_really_long_baz"</span><span class="p">)</span> <span class="nc">as</span> <span class="nc">baz,</span>
|
||
<span class="nc">open</span><span class="p">(</span><span class="s2">"a_really_long_bar"</span><span class="p">)</span> <span class="nc">as</span> <span class="nc">bar</span>
|
||
<span class="p">)</span><span class="o">:</span>
|
||
<span class="k">...</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>is not possible since the first sets of the grammar items that can
|
||
appear as context managers include the open parenthesis, making the rule
|
||
ambiguous. This rule is not only consistent with other parts of the language (like
|
||
the rule for multiple imports), but is also very useful to auto-formatting tools,
|
||
as parenthesized groups are normally used to group elements to be
|
||
formatted together (in the same way the tools operate on the contents of lists,
|
||
sets…).</p>
|
||
</section>
|
||
<section id="complicated-ast-parsing">
|
||
<h3><a class="toc-backref" href="#complicated-ast-parsing" role="doc-backlink">Complicated AST parsing</a></h3>
|
||
<p>Another problem of the current parser is that there is a huge coupling between the
|
||
AST generation routines and the particular shape of the produced parse trees. This
|
||
makes the code for generating the AST especially complicated as many actions and
|
||
choices are implicit. For instance, the AST generation code knows what
|
||
alternatives of a certain rule are produced based on the number of child nodes
|
||
present in a given parse node. This makes the code difficult to follow as this
|
||
property is not directly related to the grammar file and is influenced by
|
||
implementation details. As a result of this, a considerable amount of the AST
|
||
generation code needs to deal with inspecting and reasoning about the particular
|
||
shape of the parse trees that it receives.</p>
|
||
</section>
|
||
<section id="lack-of-left-recursion">
|
||
<h3><a class="toc-backref" href="#lack-of-left-recursion" role="doc-backlink">Lack of left recursion</a></h3>
|
||
<p>As described previously, a limitation of LL(1) grammars is that they cannot allow
|
||
left-recursion. This makes writing some rules very unnatural and far from how
|
||
programmers normally think about the program. For instance this construct (a simpler
|
||
variation of several rules present in the current grammar):</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">expr</span><span class="o">:</span> <span class="nc">expr</span> <span class="s1">'+'</span> <span class="nc">term</span> <span class="o">|</span> <span class="nc">term</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>cannot be parsed by an LL(1) parser. The traditional remedy is to rewrite the
|
||
grammar to circumvent the problem:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">expr</span><span class="o">:</span> <span class="nc">term</span> <span class="p">(</span><span class="s1">'+'</span> <span class="nc">term</span><span class="p">)</span><span class="o">*</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>The problem that appears with this form is that the parse tree is forced to have a
|
||
very unnatural shape. This is because with this rule, for the input program <code class="docutils literal notranslate"><span class="pre">a</span> <span class="pre">+</span>
|
||
<span class="pre">b</span> <span class="pre">+</span> <span class="pre">c</span></code> the parse tree will be flattened (<code class="docutils literal notranslate"><span class="pre">['a',</span> <span class="pre">'+',</span> <span class="pre">'b',</span> <span class="pre">'+',</span> <span class="pre">'c']</span></code>) and must
|
||
be post-processed to construct a left-recursive parse tree (<code class="docutils literal notranslate"><span class="pre">[['a',</span> <span class="pre">'+',</span> <span class="pre">'b'],</span>
|
||
<span class="pre">'+',</span> <span class="pre">'c']</span></code>). Being forced to write the second rule not only leads to the parse
|
||
tree not correctly reflecting the desired associativity, but also imposes further
|
||
pressure on later compilation stages to detect and post-process these cases.</p>
|
||
</section>
|
||
<section id="intermediate-parse-tree">
|
||
<h3><a class="toc-backref" href="#intermediate-parse-tree" role="doc-backlink">Intermediate parse tree</a></h3>
|
||
<p>The last problem present in the current parser is the intermediate creation of a
|
||
parse tree or Concrete Syntax Tree that is later transformed to an Abstract Syntax
|
||
Tree. Although the construction of a CST is very common in parser and compiler
|
||
pipelines, in CPython this intermediate CST is not used by anything else (it is
|
||
only indirectly exposed by the <em>parser</em> module and a surprisingly small part of
|
||
the code in the CST production is reused in the module). Which is worse: the whole
|
||
tree is kept in memory, keeping many branches that consist of chains of nodes with
|
||
a single child. This has been shown to consume a considerable amount of memory (for
|
||
instance in <a class="reference external" href="https://github.com/python/cpython/issues/70603">bpo-26415: Excessive peak memory consumption by the Python
|
||
parser</a>).</p>
|
||
<p>Having to produce an intermediate result between the grammar and the AST is not only
|
||
undesirable but also makes the AST generation step much more complicated, raising
|
||
considerably the maintenance burden.</p>
|
||
</section>
|
||
</section>
|
||
<section id="the-new-proposed-peg-parser">
|
||
<h2><a class="toc-backref" href="#the-new-proposed-peg-parser" role="doc-backlink">The new proposed PEG parser</a></h2>
|
||
<p>The new proposed PEG parser contains the following pieces:</p>
|
||
<ul class="simple">
|
||
<li>A parser generator that can read a grammar file and produce a PEG parser
|
||
written in Python or C that can parse said grammar.</li>
|
||
<li>A PEG meta-grammar that automatically generates a Python parser that is used
|
||
for the parser generator itself (this means that there are no manually-written
|
||
parsers).</li>
|
||
<li>A generated parser (using the parser generator) that can directly produce C and
|
||
Python AST objects.</li>
|
||
</ul>
|
||
<section id="left-recursion">
|
||
<h3><a class="toc-backref" href="#left-recursion" role="doc-backlink">Left recursion</a></h3>
|
||
<p>PEG parsers normally do not support left recursion but we have implemented a
|
||
technique similar to the one described in Medeiros et al. <a class="footnote-reference brackets" href="#id11" id="id2">[2]</a> but using the
|
||
memoization cache instead of static variables. This approach is closer to the one
|
||
described in Warth et al. <a class="footnote-reference brackets" href="#id12" id="id3">[3]</a>. This allows us to write not only simple left-recursive
|
||
rules but also more complicated rules that involve indirect left-recursion like:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule1</span><span class="o">:</span> <span class="nc">rule2</span> <span class="o">|</span> <span class="s1">'a'</span>
|
||
<span class="nc">rule2</span><span class="o">:</span> <span class="nc">rule3</span> <span class="o">|</span> <span class="s1">'b'</span>
|
||
<span class="nc">rule3</span><span class="o">:</span> <span class="nc">rule1</span> <span class="o">|</span> <span class="s1">'c'</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>and “hidden left-recursion” like:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="s1">'optional'</span><span class="o">?</span> <span class="nc">rule</span> <span class="s1">'@'</span> <span class="nc">some_other_rule</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="syntax">
|
||
<h3><a class="toc-backref" href="#syntax" role="doc-backlink">Syntax</a></h3>
|
||
<p>The grammar consists of a sequence of rules of the form:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="nc">expression</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Optionally, a type can be included right after the rule name, which
|
||
specifies the return type of the C or Python function corresponding to
|
||
the rule:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="p">[</span><span class="s">return_type</span><span class="p">]</span><span class="o">:</span> <span class="nc">expression</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>If the return type is omitted, then a <code class="docutils literal notranslate"><span class="pre">void</span> <span class="pre">*</span></code> is returned in C and an
|
||
<code class="docutils literal notranslate"><span class="pre">Any</span></code> in Python.</p>
|
||
<section id="grammar-expressions">
|
||
<h4><a class="toc-backref" href="#grammar-expressions" role="doc-backlink">Grammar Expressions</a></h4>
|
||
<section id="comment">
|
||
<h5><a class="toc-backref" href="#comment" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">#</span> <span class="pre">comment</span></code></a></h5>
|
||
<p>Python-style comments.</p>
|
||
</section>
|
||
<section id="e1-e2">
|
||
<h5><a class="toc-backref" href="#e1-e2" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">e2</span></code></a></h5>
|
||
<p>Match e1, then match e2.</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="nc">first_rule</span> <span class="nc">second_rule</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="e1-e2-1">
|
||
<span id="id4"></span><h5><a class="toc-backref" href="#e1-e2-1" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">|</span> <span class="pre">e2</span></code></a></h5>
|
||
<p>Match e1 or e2.</p>
|
||
<p>The first alternative can also appear on the line after the rule name
|
||
for formatting purposes. In that case, a | must be used before the
|
||
first alternative, like so:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="p">[</span><span class="s">return_type</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">first_alt</span>
|
||
<span class="o">|</span> <span class="nc">second_alt</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="e">
|
||
<h5><a class="toc-backref" href="#e" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">(</span> <span class="pre">e</span> <span class="pre">)</span></code></a></h5>
|
||
<p>Match e.</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">(</span><span class="nc">e</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>A slightly more complex and useful example includes using the grouping
|
||
operator together with the repeat operators:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">(</span><span class="nc">e1</span> <span class="nc">e2</span><span class="p">)</span><span class="o">*</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="e-or-e">
|
||
<h5><a class="toc-backref" href="#e-or-e" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">[</span> <span class="pre">e</span> <span class="pre">]</span> <span class="pre">or</span> <span class="pre">e?</span></code></a></h5>
|
||
<p>Optionally match e.</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">[</span><span class="s">e</span><span class="p">]</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>A more useful example includes defining that a trailing comma is
|
||
optional:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="nc">e</span> <span class="p">(</span><span class="s1">','</span> <span class="nc">e</span><span class="p">)</span><span class="o">*</span> <span class="p">[</span><span class="s">','</span><span class="p">]</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="e-1">
|
||
<span id="id5"></span><h5><a class="toc-backref" href="#e-1" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">e*</span></code></a></h5>
|
||
<p>Match zero or more occurrences of e.</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">(</span><span class="nc">e1</span> <span class="nc">e2</span><span class="p">)</span><span class="o">*</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="e-2">
|
||
<span id="id6"></span><h5><a class="toc-backref" href="#e-2" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">e+</span></code></a></h5>
|
||
<p>Match one or more occurrences of e.</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">(</span><span class="nc">e1</span> <span class="nc">e2</span><span class="p">)</span><span class="o">+</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="s-e">
|
||
<h5><a class="toc-backref" href="#s-e" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">s.e+</span></code></a></h5>
|
||
<p>Match one or more occurrences of e, separated by s. The generated parse
|
||
tree does not include the separator. This is otherwise identical to
|
||
<code class="docutils literal notranslate"><span class="pre">(e</span> <span class="pre">(s</span> <span class="pre">e)*)</span></code>.</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="s1">','</span><span class="k">.</span><span class="nc">e</span><span class="o">+</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="e-3">
|
||
<span id="id7"></span><h5><a class="toc-backref" href="#e-3" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">&e</span></code></a></h5>
|
||
<p>Succeed if e can be parsed, without consuming any input.</p>
|
||
</section>
|
||
<section id="e-4">
|
||
<span id="id8"></span><h5><a class="toc-backref" href="#e-4" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">!e</span></code></a></h5>
|
||
<p>Fail if e can be parsed, without consuming any input.</p>
|
||
<p>An example taken from the proposed Python grammar specifies that a primary
|
||
consists of an atom, which is not followed by a <code class="docutils literal notranslate"><span class="pre">.</span></code> or a <code class="docutils literal notranslate"><span class="pre">(</span></code> or a
|
||
<code class="docutils literal notranslate"><span class="pre">[</span></code>:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">primary</span><span class="o">:</span> <span class="nc">atom</span> <span class="o">!</span><span class="s1">'.'</span> <span class="o">!</span><span class="s1">'('</span> <span class="o">!</span><span class="s1">'['</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="e-5">
|
||
<span id="id9"></span><h5><a class="toc-backref" href="#e-5" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">~</span></code></a></h5>
|
||
<p>Commit to the current alternative, even if it fails to parse.</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="s1">'('</span> <span class="o">~</span> <span class="nc">some_rule</span> <span class="s1">')'</span> <span class="o">|</span> <span class="nc">some_alt</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>In this example, if a left parenthesis is parsed, then the other
|
||
alternative won’t be considered, even if some_rule or ‘)’ fail to be
|
||
parsed.</p>
|
||
</section>
|
||
</section>
|
||
<section id="variables-in-the-grammar">
|
||
<h4><a class="toc-backref" href="#variables-in-the-grammar" role="doc-backlink">Variables in the Grammar</a></h4>
|
||
<p>A subexpression can be named by preceding it with an identifier and an
|
||
<code class="docutils literal notranslate"><span class="pre">=</span></code> sign. The name can then be used in the action (see below), like this:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="p">[</span><span class="s">return_type</span><span class="p">]</span><span class="o">:</span> <span class="s1">'('</span> <span class="nc">a</span><span class="o">=</span><span class="nc">some_other_rule</span> <span class="s1">')'</span> <span class="nc">{</span> <span class="nc">a</span> <span class="nc">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="grammar-actions">
|
||
<h3><a class="toc-backref" href="#grammar-actions" role="doc-backlink">Grammar actions</a></h3>
|
||
<p>To avoid the intermediate steps that obscure the relationship between the
|
||
grammar and the AST generation the proposed PEG parser allows directly
|
||
generating AST nodes for a rule via grammar actions. Grammar actions are
|
||
language-specific expressions that are evaluated when a grammar rule is
|
||
successfully parsed. These expressions can be written in Python or C
|
||
depending on the desired output of the parser generator. This means that if
|
||
one would want to generate a parser in Python and another in C, two grammar
|
||
files should be written, each one with a different set of actions, keeping
|
||
everything else apart from said actions identical in both files. As an
|
||
example of a grammar with Python actions, the piece of the parser generator
|
||
that parses grammar files is bootstrapped from a meta-grammar file with
|
||
Python actions that generate the grammar tree as a result of the parsing.</p>
|
||
<p>In the specific case of the new proposed PEG grammar for Python, having
|
||
actions allows directly describing how the AST is composed in the grammar
|
||
itself, making it more clear and maintainable. This AST generation process is
|
||
supported by the use of some helper functions that factor out common AST
|
||
object manipulations and some other required operations that are not directly
|
||
related to the grammar.</p>
|
||
<p>To indicate these actions each alternative can be followed by the action code
|
||
inside curly-braces, which specifies the return value of the alternative:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="p">[</span><span class="s">return_type</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">first_alt1</span> <span class="nc">first_alt2</span> <span class="nc">{</span> <span class="nc">first_alt1</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">second_alt1</span> <span class="nc">second_alt2</span> <span class="nc">{</span> <span class="nc">second_alt1</span> <span class="nc">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>If the action is omitted and C code is being generated, then there are two
|
||
different possibilities:</p>
|
||
<ol class="arabic simple">
|
||
<li>If there’s a single name in the alternative, this gets returned.</li>
|
||
<li>If not, a dummy name object gets returned (this case should be avoided).</li>
|
||
</ol>
|
||
<p>If the action is omitted and Python code is being generated, then a list
|
||
with all the parsed expressions gets returned (this is meant for debugging).</p>
|
||
<p>The full meta-grammar for the grammars supported by the PEG generator is:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">start</span><span class="p">[</span><span class="s">Grammar</span><span class="p">]</span><span class="o">:</span> <span class="nc">grammar</span> <span class="nc">ENDMARKER</span> <span class="nc">{</span> <span class="nc">grammar</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">grammar</span><span class="p">[</span><span class="s">Grammar</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">metas</span> <span class="nc">rules</span> <span class="nc">{</span> <span class="nc">Grammar</span><span class="p">(</span><span class="nc">rules,</span> <span class="nc">metas</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">rules</span> <span class="nc">{</span> <span class="nc">Grammar</span><span class="p">(</span><span class="nc">rules,</span> <span class="p">[])</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">metas</span><span class="p">[</span><span class="s">MetaList</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">meta</span> <span class="nc">metas</span> <span class="nc">{</span> <span class="p">[</span><span class="s">meta</span><span class="p">]</span> <span class="o">+</span> <span class="nc">metas</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">meta</span> <span class="nc">{</span> <span class="p">[</span><span class="s">meta</span><span class="p">]</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">meta</span><span class="p">[</span><span class="s">MetaTuple</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="s2">"@"</span> <span class="nc">NAME</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="p">(</span><span class="nc">name.string,</span> <span class="nc">None</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="s2">"@"</span> <span class="nc">a</span><span class="o">=</span><span class="nc">NAME</span> <span class="nc">b</span><span class="o">=</span><span class="nc">NAME</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="p">(</span><span class="nc">a.string,</span> <span class="nc">b.string</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="s2">"@"</span> <span class="nc">NAME</span> <span class="nc">STRING</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="p">(</span><span class="nc">name.string,</span> <span class="nc">literal_eval</span><span class="p">(</span><span class="nc">string.string</span><span class="p">))</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">rules</span><span class="p">[</span><span class="s">RuleList</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">rule</span> <span class="nc">rules</span> <span class="nc">{</span> <span class="p">[</span><span class="s">rule</span><span class="p">]</span> <span class="o">+</span> <span class="nc">rules</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">rule</span> <span class="nc">{</span> <span class="p">[</span><span class="s">rule</span><span class="p">]</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">rule</span><span class="p">[</span><span class="s">Rule</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">rulename</span> <span class="s2">":"</span> <span class="nc">alts</span> <span class="nc">NEWLINE</span> <span class="nc">INDENT</span> <span class="nc">more_alts</span> <span class="nc">DEDENT</span> <span class="nc">{</span>
|
||
<span class="nc">Rule</span><span class="p">(</span><span class="nc">rulename</span><span class="p">[</span><span class="s">0</span><span class="p">]</span><span class="nc">,</span> <span class="nc">rulename</span><span class="p">[</span><span class="s">1</span><span class="p">]</span><span class="nc">,</span> <span class="nc">Rhs</span><span class="p">(</span><span class="nc">alts.alts</span> <span class="o">+</span> <span class="nc">more_alts.alts</span><span class="p">))</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">rulename</span> <span class="s2">":"</span> <span class="nc">NEWLINE</span> <span class="nc">INDENT</span> <span class="nc">more_alts</span> <span class="nc">DEDENT</span> <span class="nc">{</span> <span class="nc">Rule</span><span class="p">(</span><span class="nc">rulename</span><span class="p">[</span><span class="s">0</span><span class="p">]</span><span class="nc">,</span> <span class="nc">rulename</span><span class="p">[</span><span class="s">1</span><span class="p">]</span><span class="nc">,</span> <span class="nc">more_alts</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">rulename</span> <span class="s2">":"</span> <span class="nc">alts</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="nc">Rule</span><span class="p">(</span><span class="nc">rulename</span><span class="p">[</span><span class="s">0</span><span class="p">]</span><span class="nc">,</span> <span class="nc">rulename</span><span class="p">[</span><span class="s">1</span><span class="p">]</span><span class="nc">,</span> <span class="nc">alts</span><span class="p">)</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">rulename</span><span class="p">[</span><span class="s">RuleName</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">NAME</span> <span class="s1">'['</span> <span class="nc">type</span><span class="o">=</span><span class="nc">NAME</span> <span class="s1">'*'</span> <span class="s1">']'</span> <span class="nc">{</span><span class="p">(</span><span class="nc">name.string,</span> <span class="nc">type.string</span><span class="o">+</span><span class="s2">"*"</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">NAME</span> <span class="s1">'['</span> <span class="nc">type</span><span class="o">=</span><span class="nc">NAME</span> <span class="s1">']'</span> <span class="nc">{</span><span class="p">(</span><span class="nc">name.string,</span> <span class="nc">type.string</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">NAME</span> <span class="nc">{</span><span class="p">(</span><span class="nc">name.string,</span> <span class="nc">None</span><span class="p">)</span><span class="nc">}</span>
|
||
|
||
<span class="nc">alts</span><span class="p">[</span><span class="s">Rhs</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">alt</span> <span class="s2">"|"</span> <span class="nc">alts</span> <span class="nc">{</span> <span class="nc">Rhs</span><span class="p">([</span><span class="s">alt</span><span class="p">]</span> <span class="o">+</span> <span class="nc">alts.alts</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">alt</span> <span class="nc">{</span> <span class="nc">Rhs</span><span class="p">([</span><span class="s">alt</span><span class="p">])</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">more_alts</span><span class="p">[</span><span class="s">Rhs</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="s2">"|"</span> <span class="nc">alts</span> <span class="nc">NEWLINE</span> <span class="nc">more_alts</span> <span class="nc">{</span> <span class="nc">Rhs</span><span class="p">(</span><span class="nc">alts.alts</span> <span class="o">+</span> <span class="nc">more_alts.alts</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="s2">"|"</span> <span class="nc">alts</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="nc">Rhs</span><span class="p">(</span><span class="nc">alts.alts</span><span class="p">)</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">alt</span><span class="p">[</span><span class="s">Alt</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">items</span> <span class="s1">'$'</span> <span class="nc">action</span> <span class="nc">{</span> <span class="nc">Alt</span><span class="p">(</span><span class="nc">items</span> <span class="o">+</span> <span class="p">[</span><span class="s">NamedItem(None, NameLeaf('ENDMARKER'))</span><span class="p">]</span><span class="nc">,</span> <span class="nc">action</span><span class="o">=</span><span class="nc">action</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">items</span> <span class="s1">'$'</span> <span class="nc">{</span> <span class="nc">Alt</span><span class="p">(</span><span class="nc">items</span> <span class="o">+</span> <span class="p">[</span><span class="s">NamedItem(None, NameLeaf('ENDMARKER'))</span><span class="p">]</span><span class="nc">,</span> <span class="nc">action</span><span class="o">=</span><span class="nc">None</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">items</span> <span class="nc">action</span> <span class="nc">{</span> <span class="nc">Alt</span><span class="p">(</span><span class="nc">items,</span> <span class="nc">action</span><span class="o">=</span><span class="nc">action</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">items</span> <span class="nc">{</span> <span class="nc">Alt</span><span class="p">(</span><span class="nc">items,</span> <span class="nc">action</span><span class="o">=</span><span class="nc">None</span><span class="p">)</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">items</span><span class="p">[</span><span class="s">NamedItemList</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">named_item</span> <span class="nc">items</span> <span class="nc">{</span> <span class="p">[</span><span class="s">named_item</span><span class="p">]</span> <span class="o">+</span> <span class="nc">items</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">named_item</span> <span class="nc">{</span> <span class="p">[</span><span class="s">named_item</span><span class="p">]</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">named_item</span><span class="p">[</span><span class="s">NamedItem</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">NAME</span> <span class="s1">'='</span> <span class="o">~</span> <span class="nc">item</span> <span class="nc">{NamedItem</span><span class="p">(</span><span class="nc">name.string,</span> <span class="nc">item</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">item</span> <span class="nc">{NamedItem</span><span class="p">(</span><span class="nc">None,</span> <span class="nc">item</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">it</span><span class="o">=</span><span class="nc">lookahead</span> <span class="nc">{NamedItem</span><span class="p">(</span><span class="nc">None,</span> <span class="nc">it</span><span class="p">)</span><span class="nc">}</span>
|
||
|
||
<span class="nc">lookahead</span><span class="p">[</span><span class="s">LookaheadOrCut</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="s1">'&'</span> <span class="o">~</span> <span class="nc">atom</span> <span class="nc">{PositiveLookahead</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="s1">'!'</span> <span class="o">~</span> <span class="nc">atom</span> <span class="nc">{NegativeLookahead</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="s1">'~'</span> <span class="nc">{Cut</span><span class="p">()</span><span class="nc">}</span>
|
||
|
||
<span class="nc">item</span><span class="p">[</span><span class="s">Item</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="s1">'['</span> <span class="o">~</span> <span class="nc">alts</span> <span class="s1">']'</span> <span class="nc">{Opt</span><span class="p">(</span><span class="nc">alts</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">atom</span> <span class="s1">'?'</span> <span class="nc">{Opt</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">atom</span> <span class="s1">'*'</span> <span class="nc">{Repeat0</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">atom</span> <span class="s1">'+'</span> <span class="nc">{Repeat1</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">sep</span><span class="o">=</span><span class="nc">atom</span> <span class="s1">'.'</span> <span class="nc">node</span><span class="o">=</span><span class="nc">atom</span> <span class="s1">'+'</span> <span class="nc">{Gather</span><span class="p">(</span><span class="nc">sep,</span> <span class="nc">node</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">atom</span> <span class="nc">{atom}</span>
|
||
|
||
<span class="nc">atom</span><span class="p">[</span><span class="s">Plain</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="s1">'('</span> <span class="o">~</span> <span class="nc">alts</span> <span class="s1">')'</span> <span class="nc">{Group</span><span class="p">(</span><span class="nc">alts</span><span class="p">)</span><span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">NAME</span> <span class="nc">{NameLeaf</span><span class="p">(</span><span class="nc">name.string</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">STRING</span> <span class="nc">{StringLeaf</span><span class="p">(</span><span class="nc">string.string</span><span class="p">)</span><span class="nc">}</span>
|
||
|
||
<span class="c1"># Mini-grammar for the actions</span>
|
||
|
||
<span class="nc">action</span><span class="p">[</span><span class="s">str</span><span class="p">]</span><span class="o">:</span> <span class="s2">"{"</span> <span class="o">~</span> <span class="nc">target_atoms</span> <span class="s2">"}"</span> <span class="nc">{</span> <span class="nc">target_atoms</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">target_atoms</span><span class="p">[</span><span class="s">str</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">target_atom</span> <span class="nc">target_atoms</span> <span class="nc">{</span> <span class="nc">target_atom</span> <span class="o">+</span> <span class="s2">" "</span> <span class="o">+</span> <span class="nc">target_atoms</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">target_atom</span> <span class="nc">{</span> <span class="nc">target_atom</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">target_atom</span><span class="p">[</span><span class="s">str</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="s2">"{"</span> <span class="o">~</span> <span class="nc">target_atoms</span> <span class="s2">"}"</span> <span class="nc">{</span> <span class="s2">"{"</span> <span class="o">+</span> <span class="nc">target_atoms</span> <span class="o">+</span> <span class="s2">"}"</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">NAME</span> <span class="nc">{</span> <span class="nc">name.string</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">NUMBER</span> <span class="nc">{</span> <span class="nc">number.string</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">STRING</span> <span class="nc">{</span> <span class="nc">string.string</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="s2">"?"</span> <span class="nc">{</span> <span class="s2">"?"</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="s2">":"</span> <span class="nc">{</span> <span class="s2">":"</span> <span class="nc">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>As an illustrative example this simple grammar file allows directly
|
||
generating a full parser that can parse simple arithmetic expressions and that
|
||
returns a valid C-based Python AST:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">start</span><span class="p">[</span><span class="s">mod_ty</span><span class="p">]</span><span class="o">:</span> <span class="nc">a</span><span class="o">=</span><span class="nc">expr_stmt</span><span class="o">*</span> <span class="nc">$</span> <span class="nc">{</span> <span class="nc">Module</span><span class="p">(</span><span class="nc">a,</span> <span class="nc">NULL,</span> <span class="nc">p->arena</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="nc">expr_stmt</span><span class="p">[</span><span class="s">stmt_ty</span><span class="p">]</span><span class="o">:</span> <span class="nc">a</span><span class="o">=</span><span class="nc">expr</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="nc">_Py_Expr</span><span class="p">(</span><span class="nc">a,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="nc">expr</span><span class="p">[</span><span class="s">expr_ty</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">expr</span> <span class="s1">'+'</span> <span class="nc">r</span><span class="o">=</span><span class="nc">term</span> <span class="nc">{</span> <span class="nc">_Py_BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">Add,</span> <span class="nc">r,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">expr</span> <span class="s1">'-'</span> <span class="nc">r</span><span class="o">=</span><span class="nc">term</span> <span class="nc">{</span> <span class="nc">_Py_BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">Sub,</span> <span class="nc">r,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">t</span><span class="o">=</span><span class="nc">term</span> <span class="nc">{</span> <span class="nc">t</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">term</span><span class="p">[</span><span class="s">expr_ty</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">term</span> <span class="s1">'*'</span> <span class="nc">r</span><span class="o">=</span><span class="nc">factor</span> <span class="nc">{</span> <span class="nc">_Py_BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">Mult,</span> <span class="nc">r,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">term</span> <span class="s1">'/'</span> <span class="nc">r</span><span class="o">=</span><span class="nc">factor</span> <span class="nc">{</span> <span class="nc">_Py_BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">Div,</span> <span class="nc">r,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">f</span><span class="o">=</span><span class="nc">factor</span> <span class="nc">{</span> <span class="nc">f</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">factor</span><span class="p">[</span><span class="s">expr_ty</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="s1">'('</span> <span class="nc">e</span><span class="o">=</span><span class="nc">expr</span> <span class="s1">')'</span> <span class="nc">{</span> <span class="nc">e</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">a</span><span class="o">=</span><span class="nc">atom</span> <span class="nc">{</span> <span class="nc">a</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">atom</span><span class="p">[</span><span class="s">expr_ty</span><span class="p">]</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">n</span><span class="o">=</span><span class="nc">NAME</span> <span class="nc">{</span> <span class="nc">n</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">n</span><span class="o">=</span><span class="nc">NUMBER</span> <span class="nc">{</span> <span class="nc">n</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">s</span><span class="o">=</span><span class="nc">STRING</span> <span class="nc">{</span> <span class="nc">s</span> <span class="nc">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Here <code class="docutils literal notranslate"><span class="pre">EXTRA</span></code> is a macro that expands to <code class="docutils literal notranslate"><span class="pre">start_lineno,</span> <span class="pre">start_col_offset,</span>
|
||
<span class="pre">end_lineno,</span> <span class="pre">end_col_offset,</span> <span class="pre">p->arena</span></code>, those being variables automatically
|
||
injected by the parser; <code class="docutils literal notranslate"><span class="pre">p</span></code> points to an object that holds on to all state
|
||
for the parser.</p>
|
||
<p>A similar grammar written to target Python AST objects:</p>
|
||
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">start</span><span class="o">:</span> <span class="nc">expr</span> <span class="nc">NEWLINE</span><span class="o">?</span> <span class="nc">ENDMARKER</span> <span class="nc">{</span> <span class="nc">ast.Expression</span><span class="p">(</span><span class="nc">expr</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="nc">expr</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">expr</span> <span class="s1">'+'</span> <span class="nc">term</span> <span class="nc">{</span> <span class="nc">ast.BinOp</span><span class="p">(</span><span class="nc">expr,</span> <span class="nc">ast.Add</span><span class="p">()</span><span class="nc">,</span> <span class="nc">term</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">expr</span> <span class="s1">'-'</span> <span class="nc">term</span> <span class="nc">{</span> <span class="nc">ast.BinOp</span><span class="p">(</span><span class="nc">expr,</span> <span class="nc">ast.Sub</span><span class="p">()</span><span class="nc">,</span> <span class="nc">term</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">term</span> <span class="nc">{</span> <span class="nc">term</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">term</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">term</span> <span class="s1">'*'</span> <span class="nc">r</span><span class="o">=</span><span class="nc">factor</span> <span class="nc">{</span> <span class="nc">ast.BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">ast.Mult</span><span class="p">()</span><span class="nc">,</span> <span class="nc">r</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">term</span> <span class="s1">'/'</span> <span class="nc">factor</span> <span class="nc">{</span> <span class="nc">ast.BinOp</span><span class="p">(</span><span class="nc">term,</span> <span class="nc">ast.Div</span><span class="p">()</span><span class="nc">,</span> <span class="nc">factor</span><span class="p">)</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">factor</span> <span class="nc">{</span> <span class="nc">factor</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">factor</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="s1">'('</span> <span class="nc">expr</span> <span class="s1">')'</span> <span class="nc">{</span> <span class="nc">expr</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">atom</span> <span class="nc">{</span> <span class="nc">atom</span> <span class="nc">}</span>
|
||
|
||
<span class="nc">atom</span><span class="o">:</span>
|
||
<span class="o">|</span> <span class="nc">NAME</span> <span class="nc">{</span> <span class="nc">ast.Name</span><span class="p">(</span><span class="nc">id</span><span class="o">=</span><span class="nc">name.string,</span> <span class="nc">ctx</span><span class="o">=</span><span class="nc">ast.Load</span><span class="p">())</span> <span class="nc">}</span>
|
||
<span class="o">|</span> <span class="nc">NUMBER</span> <span class="nc">{</span> <span class="nc">ast.Constant</span><span class="p">(</span><span class="nc">value</span><span class="o">=</span><span class="nc">ast.literal_eval</span><span class="p">(</span><span class="nc">number.string</span><span class="p">))</span> <span class="nc">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="migration-plan">
|
||
<h2><a class="toc-backref" href="#migration-plan" role="doc-backlink">Migration plan</a></h2>
|
||
<p>This section describes the migration plan when porting to the new PEG-based parser
|
||
if this PEP is accepted. The migration will be executed in a series of steps that allow
|
||
initially to fallback to the previous parser if needed:</p>
|
||
<ol class="arabic simple">
|
||
<li>Starting with Python 3.9 alpha 6, include the new PEG-based parser machinery in CPython
|
||
with a command-line flag and environment variable that allows switching between
|
||
the new and the old parsers together with explicit APIs that allow invoking the
|
||
new and the old parsers independently. At this step, all Python APIs like <code class="docutils literal notranslate"><span class="pre">ast.parse</span></code>
|
||
and <code class="docutils literal notranslate"><span class="pre">compile</span></code> will use the parser set by the flags or the environment variable and
|
||
the default parser will be the new PEG-based parser.</li>
|
||
<li>Between Python 3.9 and Python 3.10, the old parser and related code (like the
|
||
“parser” module) will be kept until a new Python release happens (Python 3.10). In
|
||
the meanwhile and until the old parser is removed, <strong>no new Python Grammar
|
||
addition will be added that requires the PEG parser</strong>. This means that the grammar
|
||
will be kept LL(1) until the old parser is removed.</li>
|
||
<li>In Python 3.10, remove the old parser, the command-line flag, the environment
|
||
variable and the “parser” module and related code.</li>
|
||
</ol>
|
||
</section>
|
||
<section id="performance-and-validation">
|
||
<h2><a class="toc-backref" href="#performance-and-validation" role="doc-backlink">Performance and validation</a></h2>
|
||
<p>We have done extensive timing and validation of the new parser, and
|
||
this gives us confidence that the new parser is of high enough quality
|
||
to replace the current parser.</p>
|
||
<section id="validation">
|
||
<h3><a class="toc-backref" href="#validation" role="doc-backlink">Validation</a></h3>
|
||
<p>To start with validation, we regularly compile the entire Python 3.8
|
||
stdlib and compare every aspect of the resulting AST with that
|
||
produced by the standard compiler. (In the process we found a few bugs
|
||
in the standard parser’s treatment of line and column numbers, which
|
||
we have all fixed upstream via a series of issues and PRs.)</p>
|
||
<p>We have also occasionally compiled a much larger codebase (the approx.
|
||
3800 most popular packages on PyPI) and this has helped us find a (very)
|
||
few additional bugs in the new parser.</p>
|
||
<p>(One area we have not explored extensively is rejection of all wrong
|
||
programs. We have unit tests that check for a certain number of
|
||
explicit rejections, but more work could be done, e.g. by using a
|
||
fuzzer that inserts random subtle bugs into existing code. We’re open
|
||
to help in this area.)</p>
|
||
</section>
|
||
<section id="performance">
|
||
<h3><a class="toc-backref" href="#performance" role="doc-backlink">Performance</a></h3>
|
||
<p>We have tuned the performance of the new parser to come within 10% of
|
||
the current parser both in speed and memory consumption. While the
|
||
PEG/packrat parsing algorithm inherently consumes more memory than the
|
||
current LL(1) parser, we have an advantage because we don’t construct
|
||
an intermediate CST.</p>
|
||
<p>Below are some benchmarks. These are focused on compiling source code
|
||
to bytecode, because this is the most realistic situation. Returning
|
||
an AST to Python code is not as representative, because the process to
|
||
convert the <em>internal</em> AST (only accessible to C code) to an
|
||
<em>external</em> AST (an instance of <code class="docutils literal notranslate"><span class="pre">ast.AST</span></code>) takes more time than the
|
||
parser itself.</p>
|
||
<p>All measurements reported here are done on a recent MacBook Pro,
|
||
taking the median of three runs. No particular care was taken to stop
|
||
other applications running on the same machine.</p>
|
||
<p>The first timings are for our canonical test file, which has 100,000
|
||
lines endlessly repeating the following three lines:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="mi">1</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">+</span> <span class="mi">4</span> <span class="o">+</span> <span class="mi">5</span> <span class="o">+</span> <span class="mi">6</span> <span class="o">+</span> <span class="mi">7</span> <span class="o">+</span> <span class="mi">8</span> <span class="o">+</span> <span class="mi">9</span> <span class="o">+</span> <span class="mi">10</span> <span class="o">+</span> <span class="p">((((((</span><span class="mi">11</span> <span class="o">*</span> <span class="mi">12</span> <span class="o">*</span> <span class="mi">13</span> <span class="o">*</span> <span class="mi">14</span> <span class="o">*</span> <span class="mi">15</span> <span class="o">+</span> <span class="mi">16</span> <span class="o">*</span> <span class="mi">17</span> <span class="o">+</span> <span class="mi">18</span> <span class="o">*</span> <span class="mi">19</span> <span class="o">*</span> <span class="mi">20</span><span class="p">))))))</span>
|
||
<span class="mi">2</span><span class="o">*</span><span class="mi">3</span> <span class="o">+</span> <span class="mi">4</span><span class="o">*</span><span class="mi">5</span><span class="o">*</span><span class="mi">6</span>
|
||
<span class="mi">12</span> <span class="o">+</span> <span class="p">(</span><span class="mi">2</span> <span class="o">*</span> <span class="mi">3</span> <span class="o">*</span> <span class="mi">4</span> <span class="o">*</span> <span class="mi">5</span> <span class="o">+</span> <span class="mi">6</span> <span class="o">+</span> <span class="mi">7</span> <span class="o">*</span> <span class="mi">8</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<ul class="simple">
|
||
<li>Just parsing and throwing away the internal AST takes 1.16 seconds
|
||
with a max RSS of 681 MiB.</li>
|
||
<li>Parsing and converting to <code class="docutils literal notranslate"><span class="pre">ast.AST</span></code> takes 6.34 seconds, max RSS
|
||
1029 MiB.</li>
|
||
<li>Parsing and compiling to bytecode takes 1.28 seconds, max RSS 681
|
||
MiB.</li>
|
||
<li>With the current parser, parsing and compiling takes 1.44 seconds,
|
||
max RSS 836 MiB.</li>
|
||
</ul>
|
||
<p>For this particular test file, the new parser is faster and uses less
|
||
memory than the current parser (compare the last two bullets).</p>
|
||
<p>We also did timings with a more realistic payload, the entire Python
|
||
3.8 stdlib. This payload consists of 1,641 files, 749,570 lines,
|
||
27,622,497 bytes. (Though 11 files can’t be compiled by any Python 3
|
||
parser due to encoding issues, sometimes intentional.)</p>
|
||
<ul class="simple">
|
||
<li>Compiling and throwing away the internal AST took 2.141 seconds.
|
||
That’s 350,040 lines/sec, or 12,899,367 bytes/sec. The max RSS was
|
||
74 MiB (the largest file in the stdlib is much smaller than our
|
||
canonical test file).</li>
|
||
<li>Compiling to bytecode took 3.290 seconds. That’s 227,861 lines/sec,
|
||
or 8,396,942 bytes/sec. Max RSS 77 MiB.</li>
|
||
<li>Compiling to bytecode using the current parser took 3.367 seconds.
|
||
That’s 222,620 lines/sec, or 8,203,780 bytes/sec. Max RSS 70 MiB.</li>
|
||
</ul>
|
||
<p>Comparing the last two bullets we find that the new parser is slightly
|
||
faster but uses slightly (about 10%) more memory. We believe this is
|
||
acceptable. (Also, there are probably some more tweaks we can make to
|
||
reduce memory usage.)</p>
|
||
</section>
|
||
</section>
|
||
<section id="rejected-alternatives">
|
||
<h2><a class="toc-backref" href="#rejected-alternatives" role="doc-backlink">Rejected Alternatives</a></h2>
|
||
<p>We did not seriously consider alternative ways to implement the new
|
||
parser, but here’s a brief discussion of LALR(1).</p>
|
||
<p>Thirty years ago the first author decided to go his own way with
|
||
Python’s parser rather than using LALR(1), which was the industry
|
||
standard at the time (e.g. Bison and Yacc). The reasons were
|
||
primarily emotional (gut feelings, intuition), based on past experience
|
||
using Yacc in other projects, where grammar development took more
|
||
effort than anticipated (in part due to shift-reduce conflicts). A
|
||
specific criticism of Bison and Yacc that still holds is that their
|
||
meta-grammar (the notation used to feed the grammar into the parser
|
||
generator) does not support EBNF conveniences like
|
||
<code class="docutils literal notranslate"><span class="pre">[optional_clause]</span></code> or <code class="docutils literal notranslate"><span class="pre">(repeated_clause)*</span></code>. Using a custom
|
||
parser generator, a syntax tree matching the structure of the grammar
|
||
could be generated automatically, and with EBNF that tree could match
|
||
the “human-friendly” structure of the grammar.</p>
|
||
<p>Other variants of LR were not considered, nor was LL (e.g. ANTLR).
|
||
PEG was selected because it was easy to understand given a basic
|
||
understanding of recursive-descent parsing.</p>
|
||
</section>
|
||
<section id="references">
|
||
<h2><a class="toc-backref" href="#references" role="doc-backlink">References</a></h2>
|
||
<aside class="footnote-list brackets">
|
||
<aside class="footnote brackets" id="id10" role="doc-footnote">
|
||
<dt class="label" id="id10">[<a href="#id1">1</a>]</dt>
|
||
<dd>Ford, Bryan
|
||
<a class="reference external" href="https://pdos.csail.mit.edu/~baford/packrat/thesis/">https://pdos.csail.mit.edu/~baford/packrat/thesis/</a></aside>
|
||
<aside class="footnote brackets" id="id11" role="doc-footnote">
|
||
<dt class="label" id="id11">[<a href="#id2">2</a>]</dt>
|
||
<dd>Medeiros et al.
|
||
<a class="reference external" href="https://arxiv.org/pdf/1207.0443.pdf">https://arxiv.org/pdf/1207.0443.pdf</a></aside>
|
||
<aside class="footnote brackets" id="id12" role="doc-footnote">
|
||
<dt class="label" id="id12">[<a href="#id3">3</a>]</dt>
|
||
<dd>Warth et al.
|
||
<a class="reference external" href="https://web.cs.ucla.edu/~todd/research/pepm08.pdf">https://web.cs.ucla.edu/~todd/research/pepm08.pdf</a></aside>
|
||
</aside>
|
||
<p>[4] Guido’s series on PEG parsing
|
||
<a class="reference external" href="https://medium.com/@gvanrossum_83706/peg-parsing-series-de5d41b2ed60">https://medium.com/@gvanrossum_83706/peg-parsing-series-de5d41b2ed60</a></p>
|
||
</section>
|
||
<section id="copyright">
|
||
<h2><a class="toc-backref" href="#copyright" role="doc-backlink">Copyright</a></h2>
|
||
<p>This document has been placed in the public domain.</p>
|
||
</section>
|
||
</section>
|
||
<hr class="docutils" />
|
||
<p>Source: <a class="reference external" href="https://github.com/python/peps/blob/main/peps/pep-0617.rst">https://github.com/python/peps/blob/main/peps/pep-0617.rst</a></p>
|
||
<p>Last modified: <a class="reference external" href="https://github.com/python/peps/commits/main/peps/pep-0617.rst">2023-12-18 21:39:13 GMT</a></p>
|
||
|
||
</article>
|
||
<nav id="pep-sidebar">
|
||
<h2>Contents</h2>
|
||
<ul>
|
||
<li><a class="reference internal" href="#overview">Overview</a></li>
|
||
<li><a class="reference internal" href="#background-on-ll-1-parsers">Background on LL(1) parsers</a></li>
|
||
<li><a class="reference internal" href="#background-on-peg-parsers">Background on PEG parsers</a></li>
|
||
<li><a class="reference internal" href="#rationale">Rationale</a><ul>
|
||
<li><a class="reference internal" href="#some-rules-are-not-actually-ll-1">Some rules are not actually LL(1)</a></li>
|
||
<li><a class="reference internal" href="#complicated-ast-parsing">Complicated AST parsing</a></li>
|
||
<li><a class="reference internal" href="#lack-of-left-recursion">Lack of left recursion</a></li>
|
||
<li><a class="reference internal" href="#intermediate-parse-tree">Intermediate parse tree</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#the-new-proposed-peg-parser">The new proposed PEG parser</a><ul>
|
||
<li><a class="reference internal" href="#left-recursion">Left recursion</a></li>
|
||
<li><a class="reference internal" href="#syntax">Syntax</a><ul>
|
||
<li><a class="reference internal" href="#grammar-expressions">Grammar Expressions</a><ul>
|
||
<li><a class="reference internal" href="#comment"><code class="docutils literal notranslate"><span class="pre">#</span> <span class="pre">comment</span></code></a></li>
|
||
<li><a class="reference internal" href="#e1-e2"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">e2</span></code></a></li>
|
||
<li><a class="reference internal" href="#e1-e2-1"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">|</span> <span class="pre">e2</span></code></a></li>
|
||
<li><a class="reference internal" href="#e"><code class="docutils literal notranslate"><span class="pre">(</span> <span class="pre">e</span> <span class="pre">)</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-or-e"><code class="docutils literal notranslate"><span class="pre">[</span> <span class="pre">e</span> <span class="pre">]</span> <span class="pre">or</span> <span class="pre">e?</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-1"><code class="docutils literal notranslate"><span class="pre">e*</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-2"><code class="docutils literal notranslate"><span class="pre">e+</span></code></a></li>
|
||
<li><a class="reference internal" href="#s-e"><code class="docutils literal notranslate"><span class="pre">s.e+</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-3"><code class="docutils literal notranslate"><span class="pre">&e</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-4"><code class="docutils literal notranslate"><span class="pre">!e</span></code></a></li>
|
||
<li><a class="reference internal" href="#e-5"><code class="docutils literal notranslate"><span class="pre">~</span></code></a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#variables-in-the-grammar">Variables in the Grammar</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#grammar-actions">Grammar actions</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#migration-plan">Migration plan</a></li>
|
||
<li><a class="reference internal" href="#performance-and-validation">Performance and validation</a><ul>
|
||
<li><a class="reference internal" href="#validation">Validation</a></li>
|
||
<li><a class="reference internal" href="#performance">Performance</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#rejected-alternatives">Rejected Alternatives</a></li>
|
||
<li><a class="reference internal" href="#references">References</a></li>
|
||
<li><a class="reference internal" href="#copyright">Copyright</a></li>
|
||
</ul>
|
||
|
||
<br>
|
||
<a id="source" href="https://github.com/python/peps/blob/main/peps/pep-0617.rst">Page Source (GitHub)</a>
|
||
</nav>
|
||
</section>
|
||
<script src="../_static/colour_scheme.js"></script>
|
||
<script src="../_static/wrap_tables.js"></script>
|
||
<script src="../_static/sticky_banner.js"></script>
|
||
</body>
|
||
</html> |