From 86af87adc3c04af25964e11e0dd5a45c888e8c54 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 11 Jan 2016 17:18:06 +0100 Subject: [PATCH] PEP 509: minor edits --- pep-0509.txt | 118 +++++++++++++++++++++++++++------------------------ 1 file changed, 63 insertions(+), 55 deletions(-) diff --git a/pep-0509.txt b/pep-0509.txt index fd6707dea..e3cd4c7dc 100644 --- a/pep-0509.txt +++ b/pep-0509.txt @@ -57,9 +57,9 @@ optimizers. Guard example ============= -Pseudo-code of an fast guard to check if a dictionary key was modified +Pseudo-code of an fast guard to check if a dictionary entry was modified (created, updated or deleted) using an hypothetical -``get_dict_version(dict)`` function:: +``dict_get_version(dict)`` function:: UNSET = object() @@ -68,22 +68,26 @@ Pseudo-code of an fast guard to check if a dictionary key was modified self.dict = dict self.key = key self.value = dict.get(key, UNSET) - self.version = get_dict_version(dict) + self.version = dict_get_version(dict) def check(self): - """Return True if the dictionary value did not changed.""" - version = get_dict_version(self.dict) + """Return True if the dictionary entry did not changed.""" + + # read the version field of the dict structure + version = dict_get_version(self.dict) if version == self.version: - # Fast-path: avoid the dictionary lookup + # Fast-path: dictionary lookup avoided return True + # lookup in the dictionary value = self.dict.get(self.key, UNSET) - if value == self.value: + if value is self.value: # another key was modified: # cache the new dictionary version self.version = version return True + # the key was modified return False @@ -139,9 +143,9 @@ Retrospective Changes ======= -Add a ``PY_INT64_T ma_version`` field to the ``PyDictObject`` structure: -64-bit unsigned integer. New empty dictionaries are initilized to -version ``0``. The version is incremented at each change: +Add a ``ma_version`` field to the ``PyDictObject`` structure with the C +type ``PY_INT64_T``, 64-bit unsigned integer. New empty dictionaries are +initilized to version ``0``. The version is incremented at each change: * ``clear()`` if the dict was non-empty * ``pop(key)`` if the key exists @@ -153,39 +157,40 @@ version ``0``. The version is incremented at each change: * ``update(...)`` if new values are different than existing values (the version can be incremented multiple times) -Example using an hypothetical ``get_dict_version(dict)`` function:: +Example using an hypothetical ``dict_get_version(dict)`` function:: >>> d = {} - >>> get_dict_version(d) + >>> dict_get_version(d) 0 >>> d['key'] = 'value' - >>> get_dict_version(d) + >>> dict_get_version(d) 1 >>> d['key'] = 'new value' - >>> get_dict_version(d) + >>> dict_get_version(d) 2 >>> del d['key'] - >>> get_dict_version(d) + >>> dict_get_version(d) 3 If a dictionary is created with items, the version is also incremented at each dictionary insertion. Example:: >>> d=dict(x=7, y=33) - >>> get_dict_version(d) + >>> dict_get_version(d) 2 -The version is not incremented is an existing key is modified to the -same value, but only the identifier of the value is tested, not the -content of the value. Example:: +The version is not incremented if an existing key is set to the same +value. For efficiency, values are compared by their identity: +``new_value is old_value``, not by their content: +``new_value == old_value``. Example:: >>> d={} >>> value = object() >>> d['key'] = value - >>> get_dict_version(d) + >>> dict_get_version(d) 2 >>> d['key'] = value - >>> get_dict_version(d) + >>> dict_get_version(d) 2 .. note:: @@ -207,10 +212,10 @@ any overhead on dictionary operations. When the version does not change, ``PyDict_GetItem()`` takes 14.8 ns for a dictioanry lookup, whereas a guard check only takes 3.8 ns. Moreover, -a guard can watch multiple keys. For example, for an optimization using -10 global variables in a function, the check costs 148 ns for 10 dict -lookups, whereas the guard still only cost 3.8 ns when the version does -not change (39x as fast). +a guard can watch for multiple keys. For example, for an optimization +using 10 global variables in a function, 10 dictionary lookups costs 148 +ns, whereas the guard still only costs 3.8 ns when the version does not +change (39x as fast). Integer overflow @@ -230,7 +235,7 @@ to the old version modulo ``2 ** 64``. If a dictionary is modified each nanosecond, an overflow takes longer than 584 years. Using a 32-bit version, the overflow occurs only after 4 seconds. That's why a 64-bit unsigned type is also used on 32-bit -systems. +systems. A dictionary lookup at the C level takes 14.8 ns. A risk of a bug every 584 years is acceptable. @@ -249,7 +254,7 @@ the ``dict`` API). There are multiple issues: * To be consistent and avoid bad surprises, the version must be added to - all mapping type. Implementing a new mapping type would require extra + all mapping types. Implementing a new mapping type would require extra work for no benefit, since the version is only required on the ``dict`` type in practice. * All Python implementations must implement this new property, it gives @@ -260,11 +265,11 @@ There are multiple issues: ``dict.__version__ == guard_version`` must be used instead to reduce the risk of bug on integer overflow (even if the integer overflow is unlikely in practice). -* Exposing the dictioanry version can lead the +* Exposing the dictionary version at Python level can lead the false assumption on performances. Checking ``dict.__version__`` at - the Python level is not faster than a dictionary lookup. The lookup - has a cost of 48.7 ns and checking a guard has a cost of 47.5 ns, the - difference is only 1.2 ns (3%):: + the Python level is not faster than a dictionary lookup. A dictionary + lookup has a cost of 48.7 ns and checking a guard has a cost of 47.5 + ns, the difference is only 1.2 ns (3%):: $ ./python -m timeit -s 'd = {str(i):i for i in range(100)}' 'd["33"] == 33' @@ -286,53 +291,56 @@ Add a version to each dict entry A single version per dictionary requires to keep a strong reference to the value which can keep the value alive longer than expected. If we add -also a version per dictionary entry, the guard can rely on the entry -version and so avoid the strong reference to the value (only strong -references to a dictionary and key are needed). +also a version per dictionary entry, the guard can only store the entry +version to avoid the strong reference to the value (only strong +references to the dictionary and to the key are needed). -Changes: add a ``getversion(key)`` method to dictionary which returns -``None`` if the key doesn't exist. When a key is created or modified, -the entry version is set to the dictionary version which is incremented -at each change (create, modify, delete). +Changes: add a ``me_version`` field to the ``PyDictKeyEntry`` structure, +the field has the C type ``PY_INT64_T``. When a key is created or +modified, the entry version is set to the dictionary version which is +incremented at any change (create, modify, delete). -Pseudo-code of an fast guard to check if a dict key was modified using -``getversion()``:: +Pseudo-code of an fast guard to check if a dictionary key was modified +using hypothetical ``dict_get_version(dict)`` +``dict_get_entry_version(dict)`` functions:: UNSET = object() - class Guard: + class GuardDictKey: def __init__(self, dict, key): self.dict = dict self.key = key - self.dict_version = get_dict_version(dict) - self.entry_version = dict.getversion(key) + self.dict_version = dict_get_version(dict) + self.entry_version = dict_get_entry_version(dict, key) def check(self): - """Return True if the dictionary value did not changed.""" - dict_version = get_dict_version(self.dict) + """Return True if the dictionary entry did not changed.""" + + # read the version field of the dict structure + dict_version = dict_get_version(self.dict) if dict_version == self.version: - # Fast-path: avoid the dictionary lookup + # Fast-path: dictionary lookup avoided return True - # lookup in the dictionary, but get the entry version, - #not the value - entry_version = self.dict.getversion(self.key) + # lookup in the dictionary + entry_version = get_dict_key_version(dict, key) if entry_version == self.entry_version: # another key was modified: # cache the new dictionary version self.dict_version = dict_version return True + # the key was modified return False -This main drawback of this option is the impact on the memory footprint. +The main drawback of this option is the impact on the memory footprint. It increases the size of each dictionary entry, so the overhead depends on the number of buckets (dictionary entries, used or unused yet). For example, it increases the size of each dictionary entry by 8 bytes on -64-bit system if we use ``size_t``. +64-bit system. -In Python, the memory footprint matters and the trend is more to reduce -it. Examples: +In Python, the memory footprint matters and the trend is to reduce it. +Examples: * `PEP 393 -- Flexible String Representation `_ @@ -351,7 +359,7 @@ Leave the ``dict`` type unchanged to not add any overhead (memory footprint) when guards are not needed. Technical issue: a lot of C code in the wild, including CPython core, -expect the exact ``dict`` type. Issues: +expecting the exact ``dict`` type. Issues: * ``exec()`` requires a ``dict`` for globals and locals. A lot of code use ``globals={}``. It is not possible to cast the ``dict`` to a @@ -371,7 +379,7 @@ Other issues: * The garbage collector has a special code to "untrack" ``dict`` instances. If a ``dict`` subtype is used for namespaces, the garbage - collector may be unable to break some reference cycles. + collector can be unable to break some reference cycles. * Some functions have a fast-path for ``dict`` which would not be taken for ``dict`` subtypes, and so it would make Python a little bit slower.