Generate ``peps.rss`` via Sphinx (#3274)

Adam Turner 2023-08-05 17:22:22 +01:00 committed by GitHub
parent 7b86f6deb0
commit 2b53c224d1
6 changed files with 135 additions and 224 deletions

.github/CODEOWNERS

@@ -16,7 +16,6 @@ AUTHOR_OVERRIDES.csv @AA-Turner
build.py @AA-Turner
conf.py @AA-Turner
contents.rst @AA-Turner
-generate_rss.py @AA-Turner

# Linting infrastructure
.codespell/ @CAM-Gerlach @hugovk

Makefile

@@ -27,7 +27,7 @@ htmlview: html
## dirhtml to render PEPs to "index.html" files within "pep-NNNN" directories
.PHONY: dirhtml
dirhtml: BUILDER = dirhtml
-dirhtml: venv rss
+dirhtml: venv
	$(SPHINXBUILD) $(ALLSPHINXOPTS)

## fail-warning to render PEPs to "pep-NNNN.html" files and fail the Sphinx build on any warning
@@ -41,11 +41,6 @@ check-links: BUILDER = linkcheck
check-links: venv
	$(SPHINXBUILD) $(ALLSPHINXOPTS)

-## rss to generate the peps.rss file
-.PHONY: rss
-rss: venv
-	$(VENVDIR)/bin/python3 generate_rss.py -o $(OUTPUT_DIR)
-
## clean to remove the venv and build files
.PHONY: clean
clean: clean-venv
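With the standalone `rss` target removed, the feed is written as part of the normal Sphinx build. For a manual run outside `make`, the equivalent call would be something like the sketch below; the `build/.doctrees` path is an assumption matching a default Sphinx doctree layout, not taken from this commit:

from pathlib import Path

from pep_sphinx_extensions.generate_rss import create_rss_feed

# Assumed paths: Sphinx writes pickled doctrees to <output>/.doctrees by default
create_rss_feed(doctree_dir=Path("build/.doctrees"), output_dir=Path("build"))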

generate_rss.py

@@ -1,210 +0,0 @@
#!/usr/bin/env python3
# This file is placed in the public domain or under the
# CC0-1.0-Universal license, whichever is more permissive.

import argparse
import datetime as dt
import email.utils
from html import escape
from pathlib import Path
import re

import docutils.frontend
from docutils import nodes
from docutils import utils
from docutils.parsers import rst
from docutils.parsers.rst import roles

# get the directory with the PEP sources
PEP_ROOT = Path(__file__).parent


def _format_rfc_2822(datetime: dt.datetime) -> str:
    datetime = datetime.replace(tzinfo=dt.timezone.utc)
    return email.utils.format_datetime(datetime, usegmt=True)


line_cache: dict[Path, dict[str, str]] = {}

# Monkeypatch PEP and RFC reference roles to match Sphinx behaviour
EXPLICIT_TITLE_RE = re.compile(r'^(.+?)\s*(?<!\x00)<(.*?)>$', re.DOTALL)


def _pep_reference_role(role, rawtext, text, lineno, inliner,
                        options={}, content=[]):
    matched = EXPLICIT_TITLE_RE.match(text)
    if matched:
        title = utils.unescape(matched.group(1))
        target = utils.unescape(matched.group(2))
    else:
        target = utils.unescape(text)
        title = "PEP " + utils.unescape(text)
    pep_str, _, fragment = target.partition("#")
    try:
        pepnum = int(pep_str)
        if pepnum < 0 or pepnum > 9999:
            raise ValueError
    except ValueError:
        msg = inliner.reporter.error(
            f'PEP number must be a number from 0 to 9999; "{pep_str}" is invalid.',
            line=lineno)
        prb = inliner.problematic(rawtext, rawtext, msg)
        return [prb], [msg]
    # Base URL mainly used by inliner.pep_reference; so this is correct:
    ref = (inliner.document.settings.pep_base_url
           + inliner.document.settings.pep_file_url_template % pepnum)
    if fragment:
        ref += "#" + fragment
    roles.set_classes(options)
    return [nodes.reference(rawtext, title, refuri=ref, **options)], []


def _rfc_reference_role(role, rawtext, text, lineno, inliner,
                        options={}, content=[]):
    matched = EXPLICIT_TITLE_RE.match(text)
    if matched:
        title = utils.unescape(matched.group(1))
        target = utils.unescape(matched.group(2))
    else:
        target = utils.unescape(text)
        title = "RFC " + utils.unescape(text)
    pep_str, _, fragment = target.partition("#")
    try:
        rfcnum = int(pep_str)
        if rfcnum < 0 or rfcnum > 9999:
            raise ValueError
    except ValueError:
        msg = inliner.reporter.error(
            f'RFC number must be a number from 0 to 9999; "{pep_str}" is invalid.',
            line=lineno)
        prb = inliner.problematic(rawtext, rawtext, msg)
        return [prb], [msg]
    ref = (inliner.document.settings.rfc_base_url + inliner.rfc_url % rfcnum)
    if fragment:
        ref += "#" + fragment
    roles.set_classes(options)
    return [nodes.reference(rawtext, title, refuri=ref, **options)], []


roles.register_canonical_role("pep-reference", _pep_reference_role)
roles.register_canonical_role("rfc-reference", _rfc_reference_role)


def first_line_starting_with(full_path: Path, text: str) -> str:
    # Try and retrieve from cache
    if full_path in line_cache:
        return line_cache[full_path].get(text, "")

    # Else read source
    line_cache[full_path] = path_cache = {}
    for line in full_path.open(encoding="utf-8"):
        if line.startswith("Created:"):
            path_cache["Created:"] = line.removeprefix("Created:").strip()
        elif line.startswith("Title:"):
            path_cache["Title:"] = line.removeprefix("Title:").strip()
        elif line.startswith("Author:"):
            path_cache["Author:"] = line.removeprefix("Author:").strip()

        # Once all have been found, exit loop
        if path_cache.keys() == {"Created:", "Title:", "Author:"}:
            break

    return path_cache.get(text, "")


def pep_creation(full_path: Path) -> dt.datetime:
    created_str = first_line_starting_with(full_path, "Created:")
    if full_path.stem == "pep-0102":
        # remove additional content on the Created line
        created_str = created_str.split(" ", 1)[0]
    return dt.datetime.strptime(created_str, "%d-%b-%Y")


def parse_rst(full_path: Path) -> nodes.document:
    text = full_path.read_text(encoding="utf-8")
    settings = docutils.frontend.get_default_settings(rst.Parser)
    document = utils.new_document(f'<{full_path}>', settings=settings)
    rst.Parser(rfc2822=True).parse(text, document)
    return document


def pep_abstract(full_path: Path) -> str:
    """Return the first paragraph of the PEP abstract"""
    for node in parse_rst(full_path).findall(nodes.section):
        if node.next_node(nodes.title).astext() == "Abstract":
            return node.next_node(nodes.paragraph).astext().strip().replace("\n", " ")
    return ""


def main():
    parser = argparse.ArgumentParser(description="Generate RSS feed")
    parser.add_argument(
        "-o",
        "--output-dir",
        default="build",  # synchronise with render.yaml -> deploy step
        help="Output directory, relative to root. Default 'build'.",
    )
    args = parser.parse_args()

    # get list of peps with creation time (from "Created:" string in pep source)
    peps_with_dt = sorted((pep_creation(path), path) for path in PEP_ROOT.glob("pep-????.???"))

    # generate rss items for 10 most recent peps
    items = []
    for datetime, full_path in peps_with_dt[-10:]:
        try:
            pep_num = int(full_path.stem.split("-")[-1])
        except ValueError:
            continue

        title = first_line_starting_with(full_path, "Title:")
        author = first_line_starting_with(full_path, "Author:")
        if "@" in author or " at " in author:
            parsed_authors = email.utils.getaddresses([author])
            joined_authors = ", ".join(f"{name} ({email_address})" for name, email_address in parsed_authors)
        else:
            joined_authors = author
        url = f"https://peps.python.org/pep-{pep_num:0>4}/"

        item = f"""\
    <item>
      <title>PEP {pep_num}: {escape(title, quote=False)}</title>
      <link>{escape(url, quote=False)}</link>
      <description>{escape(pep_abstract(full_path), quote=False)}</description>
      <author>{escape(joined_authors, quote=False)}</author>
      <guid isPermaLink="true">{url}</guid>
      <pubDate>{_format_rfc_2822(datetime)}</pubDate>
    </item>"""
        items.append(item)

    # The rss envelope
    desc = """
    Newest Python Enhancement Proposals (PEPs) - Information on new
    language features, and some meta-information like release
    procedure and schedules.
    """
    last_build_date = _format_rfc_2822(dt.datetime.now(dt.timezone.utc))
    items = "\n".join(reversed(items))
    output = f"""\
<?xml version='1.0' encoding='UTF-8'?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
  <channel>
    <title>Newest Python PEPs</title>
    <link>https://peps.python.org/peps.rss</link>
    <description>{" ".join(desc.split())}</description>
    <atom:link href="https://peps.python.org/peps.rss" rel="self"/>
    <docs>https://cyber.harvard.edu/rss/rss.html</docs>
    <language>en</language>
    <lastBuildDate>{last_build_date}</lastBuildDate>
{items}
  </channel>
</rss>
"""

    # output directory for target HTML files
    out_dir = PEP_ROOT / args.output_dir
    out_dir.mkdir(exist_ok=True, parents=True)
    out_dir.joinpath("peps.rss").write_text(output)


if __name__ == "__main__":
    main()
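Both the deleted script and its replacement lean on two stdlib helpers: `email.utils.getaddresses` splits an `Author:` header into (name, address) pairs, and `email.utils.format_datetime` with `usegmt=True` produces the RFC 2822 dates RSS requires. A small illustration, with a hypothetical header value:

import datetime as dt
import email.utils

# Hypothetical Author header value for demonstration
author = "Guido van Rossum <guido@python.org>, Barry Warsaw <barry@python.org>"
print(email.utils.getaddresses([author]))
# [('Guido van Rossum', 'guido@python.org'), ('Barry Warsaw', 'barry@python.org')]

created = dt.datetime(2023, 8, 5, tzinfo=dt.timezone.utc)
print(email.utils.format_datetime(created, usegmt=True))
# Sat, 05 Aug 2023 00:00:00 GMT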

pep_sphinx_extensions/__init__.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING
from docutils.writers.html5_polyglot import HTMLTranslator
from sphinx import environment

+from pep_sphinx_extensions.generate_rss import create_rss_feed
from pep_sphinx_extensions.pep_processor.html import pep_html_builder
from pep_sphinx_extensions.pep_processor.html import pep_html_translator
from pep_sphinx_extensions.pep_processor.parsing import pep_banner_directive
@@ -29,9 +30,7 @@ def _update_config_for_builder(app: Sphinx) -> None:
    if app.builder.name == "dirhtml":
        app.env.settings["pep_url"] = "pep-{:0>4}"

-    # internal_builder exists if Sphinx is run by build.py
-    if "internal_builder" not in app.tags:
-        app.connect("build-finished", _post_build)  # Post-build tasks
+    app.connect("build-finished", _post_build)  # Post-build tasks


def _post_build(app: Sphinx, exception: Exception | None) -> None:
@@ -41,7 +40,11 @@ def _post_build(app: Sphinx, exception: Exception | None) -> None:
    if exception is not None:
        return
-    create_index_file(Path(app.outdir), app.builder.name)
+
+    # internal_builder exists if Sphinx is run by build.py
+    if "internal_builder" not in app.tags:
+        create_index_file(Path(app.outdir), app.builder.name)
+    create_rss_feed(app.doctreedir, app.outdir)


def setup(app: Sphinx) -> dict[str, bool]:
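For readers unfamiliar with the hook being rewired here: `build-finished` is a standard Sphinx event whose callback receives the application and the exception (if any) that aborted the build. A minimal, self-contained sketch of the same wiring, independent of this repository's code:

from pathlib import Path

from sphinx.application import Sphinx


def _post_build(app: Sphinx, exception: Exception | None) -> None:
    # Sphinx passes the exception that aborted the build, or None on success
    if exception is not None:
        return
    print("build finished, output in", Path(app.outdir))


def setup(app: Sphinx) -> dict[str, bool]:
    app.connect("build-finished", _post_build)
    return {"parallel_read_safe": True, "parallel_write_safe": True}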

pep_sphinx_extensions/generate_rss.py

@@ -0,0 +1,120 @@
# This file is placed in the public domain or under the
# CC0-1.0-Universal license, whichever is more permissive.

from __future__ import annotations

import datetime as dt
import pickle
from email.utils import format_datetime, getaddresses
from html import escape
from pathlib import Path

from docutils import nodes

RSS_DESCRIPTION = (
    "Newest Python Enhancement Proposals (PEPs): "
    "Information on new language features "
    "and some meta-information like release procedure and schedules."
)

# get the directory with the PEP sources
PEP_ROOT = Path(__file__).parent


def _format_rfc_2822(datetime: dt.datetime) -> str:
    datetime = datetime.replace(tzinfo=dt.timezone.utc)
    return format_datetime(datetime, usegmt=True)


document_cache: dict[Path, dict[str, str]] = {}


def get_from_doctree(full_path: Path, text: str) -> str:
    # Try and retrieve from cache
    if full_path in document_cache:
        return document_cache[full_path].get(text, "")

    # Else load doctree
    document = pickle.loads(full_path.read_bytes())

    # Store the headers (populated in the PEPHeaders transform)
    document_cache[full_path] = path_cache = document.get("headers", {})

    # Store the Abstract
    path_cache["Abstract"] = pep_abstract(document)

    # Return the requested key
    return path_cache.get(text, "")


def pep_creation(full_path: Path) -> dt.datetime:
    created_str = get_from_doctree(full_path, "Created")
    try:
        return dt.datetime.strptime(created_str, "%d-%b-%Y")
    except ValueError:
        return dt.datetime.min


def pep_abstract(document: nodes.document) -> str:
    """Return the first paragraph of the PEP abstract"""
    for node in document.findall(nodes.section):
        title_node = node.next_node(nodes.title)
        if title_node is None:
            continue

        if title_node.astext() == "Abstract":
            return node.next_node(nodes.paragraph).astext().strip().replace("\n", " ")
    return ""


def _generate_items(doctree_dir: Path):
    # get list of peps with creation time (from "Created:" string in pep source)
    peps_with_dt = sorted((pep_creation(path), path) for path in doctree_dir.glob("pep-????.doctree"))

    # generate rss items for 10 most recent peps (in reverse order)
    for datetime, full_path in reversed(peps_with_dt[-10:]):
        try:
            pep_num = int(get_from_doctree(full_path, "PEP"))
        except ValueError:
            continue

        title = get_from_doctree(full_path, "Title")
        url = f"https://peps.python.org/pep-{pep_num:0>4}/"
        abstract = get_from_doctree(full_path, "Abstract")
        author = get_from_doctree(full_path, "Author")
        if "@" in author or " at " in author:
            parsed_authors = getaddresses([author])
            joined_authors = ", ".join(f"{name} ({email_address})" for name, email_address in parsed_authors)
        else:
            joined_authors = author

        item = f"""\
    <item>
      <title>PEP {pep_num}: {escape(title, quote=False)}</title>
      <link>{escape(url, quote=False)}</link>
      <description>{escape(abstract, quote=False)}</description>
      <author>{escape(joined_authors, quote=False)}</author>
      <guid isPermaLink="true">{url}</guid>
      <pubDate>{_format_rfc_2822(datetime)}</pubDate>
    </item>"""
        yield item


def create_rss_feed(doctree_dir: Path, output_dir: Path):
    # The rss envelope
    last_build_date = _format_rfc_2822(dt.datetime.now(dt.timezone.utc))
    items = "\n".join(_generate_items(Path(doctree_dir)))
    output = f"""\
<?xml version='1.0' encoding='UTF-8'?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
  <channel>
    <title>Newest Python PEPs</title>
    <link>https://peps.python.org/peps.rss</link>
    <description>{RSS_DESCRIPTION}</description>
    <atom:link href="https://peps.python.org/peps.rss" rel="self"/>
    <docs>https://cyber.harvard.edu/rss/rss.html</docs>
    <language>en</language>
    <lastBuildDate>{last_build_date}</lastBuildDate>
{items}
  </channel>
</rss>
"""

    # output directory for target HTML files
    Path(output_dir, "peps.rss").write_text(output, encoding="utf-8")
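The key design change is that the generator now reads Sphinx's pickled doctrees instead of re-parsing PEP sources with docutils: each `.doctree` file is a pickled `nodes.document`, so metadata attached by the `PEPHeaders` transform can be read back directly. A sketch; the doctree path and PEP number are hypothetical:

import pickle
from pathlib import Path

from docutils import nodes

# Hypothetical location; Sphinx stores one pickled document per source file
doctree_path = Path("build/.doctrees/pep-0008.doctree")
document: nodes.document = pickle.loads(doctree_path.read_bytes())

# "headers" is populated by the PEPHeaders transform during the build
headers = document.get("headers", {})
print(headers.get("Title", ""), headers.get("Created", ""))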

pep_sphinx_extensions/pep_processor/parsing/pep_headers.py

@@ -72,11 +72,11 @@ class PEPHeaders(transforms.Transform):
            raise PEPParsingError("Document does not contain an RFC-2822 'PEP' header!")

        # Extract PEP number
-        value = pep_field[1].astext()
+        pep_num_str = pep_field[1].astext()
        try:
-            pep_num = int(value)
+            pep_num = int(pep_num_str)
        except ValueError:
-            raise PEPParsingError(f"'PEP' header must contain an integer. '{value}' is invalid!")
+            raise PEPParsingError(f"PEP header must contain an integer. '{pep_num_str}' is invalid!")

        # Special processing for PEP 0.
        if pep_num == 0:
@@ -89,7 +89,11 @@
            raise PEPParsingError("No title!")

        fields_to_remove = []
+        self.document["headers"] = headers = {}
        for field in header:
+            row_attributes = {sub.tagname: sub.rawsource for sub in field}
+            headers[row_attributes["field_name"]] = row_attributes["field_body"]
+
            name = field[0].astext().lower()
            body = field[1]
            if len(body) == 0:
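For context on the added lines: each docutils `field` node has a `field_name` child and a `field_body` child, and the dict comprehension keys their raw sources by tag name. A standalone illustration, with nodes constructed by hand purely for demonstration:

from docutils import nodes

# Build a field equivalent to "Title: A Sample PEP" by hand
field = nodes.field(
    "",
    nodes.field_name("Title", "Title"),
    nodes.field_body("A Sample PEP", nodes.paragraph(text="A Sample PEP")),
)
row_attributes = {sub.tagname: sub.rawsource for sub in field}
print(row_attributes)
# {'field_name': 'Title', 'field_body': 'A Sample PEP'}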