From a7bd65b904ade5cec30272e61629f4ebb2d11c06 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Wed, 13 Nov 2013 23:32:54 +0100 Subject: [PATCH] PEP 456: add some of the new implementation details to the PEP's text --- pep-0456.txt | 110 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 93 insertions(+), 17 deletions(-) diff --git a/pep-0456.txt b/pep-0456.txt index 69ce487f3..be7f8e33b 100644 --- a/pep-0456.txt +++ b/pep-0456.txt @@ -222,6 +222,7 @@ Murmur3 is not resilient against hash collision attacks. Therefore Murmur3 can no longer be considered as secure algorithm. It still may be an alternative is hash collision attacks are of no concern. + CityHash -------- @@ -246,6 +247,12 @@ Like MurmurHash Aumasson, Bernstein and Boßlet have shown [sip]_ a similar weakness in CityHash. +DJBX33A +------- + +TODO + + HMAC, MD5, SHA-1, SHA-2 ----------------------- @@ -268,6 +275,45 @@ SipHash provides the best combination of speed and security. Developers of other prominent projects have came to the same conclusion. +Small string optimization +========================= + +Hash functions like SipHash24 have a costly initialization and finalization +code that can dominate speed of the algorithm for very short strings. On the +other hand Python calculates the hash value of short strings quite often. A +simple and fast function for especially for hashing of small strings can make +a measurably impact on performance. For example these measurements were taken +during a run of Python's regression tests. Additional measurements of other +code have shown a similar distribution. + +===== ============ ======= +bytes hash() calls portion +===== ============ ======= +1 18709 0.2% +2 737480 9.5% +3 636178 17.6% +4 1518313 36.7% +5 643022 44.9% +6 770478 54.6% +7 525150 61.2% +8 304873 65.1% +9 297272 68.8% +10 68191 69.7% +11 1388484 87.2% +12 480786 93.3% +13 52730 93.9% +14 65309 94.8% +15 44245 95.3% +16 85643 96.4% +Total 7921678 +===== ============ ======= + +However a fast function like DJBX33A is not as secure as SipHash24. A cutoff +at about 5 to 7 bytes should provide a decent safety margin and speed up at +the same time. The PEP's reference implementation provides such a cutoff with +``Py_HASH_CUTOFF`` but disables the optimization by default. + + C API additions =============== @@ -277,28 +323,57 @@ hash secret ----------- The ``_Py_HashSecret_t`` type of Python 2.6 to 3.3 has two members with either -32- or 64-bit length each. SipHash requires two 64-bit unsigned integers as keys. -The typedef will be changed to an union with a guaranteed size of 128 bits on -all architectures. On platforms with a 64-bit data type it will have two -``uint64`` members. Because C89 compatible compilers may not have ``uint64`` -the union also has an array of 16 chars. +32- or 64-bit length each. SipHash requires two 64-bit unsigned integers as +keys. The typedef will be changed to an union with a guaranteed size of 24 +bytes on all architectures. The union provides a 128 bit random key for +SipHash24 and FNV as well as an additional value of 64 bit for the optional +small string optimization and pyexpat seed. The additional 64 bit seed ensures +that pyexpat or small string optimization cannot reveal bits of the SipHash24 +seed. + +memory layout on 64 bit systems:: + + cccccccc cccccccc cccccccc uc -- unsigned char[24] + pppppppp ssssssss ........ fnv -- two Py_hash_t + k0k0k0k0 k1k1k1k1 ........ siphash -- two PY_UINT64_T + ........ ........ ssssssss djbx33a -- 16 bytes padding + one Py_hash_t + ........ ........ eeeeeeee pyexpat XML hash salt + +memory layout on 32 bit systems:: + + cccccccc cccccccc cccccccc uc -- unsigned char[24] + ppppssss ........ ........ fnv -- two Py_hash_t + k0k0k0k0 k1k1k1k1 ........ siphash -- two PY_UINT64_T (if available) + ........ ........ ssss.... djbx33a -- 16 bytes padding + one Py_hash_t + ........ ........ eeee.... pyexpat XML hash salt new type definition:: typedef union { - unsigned char uc16[16]; + /* ensure 24 bytes */ + unsigned char uc[24]; + /* two Py_hash_t for FNV */ struct { Py_hash_t prefix; Py_hash_t suffix; - } ht; + } fnv; #ifdef PY_UINT64_T + /* two uint64 for SipHash24 */ struct { PY_UINT64_T k0; PY_UINT64_T k1; - } ui64; + } siphash; #endif + /* a different (!) Py_hash_t for small string optimization */ + struct { + unsigned char padding[16]; + Py_hash_t suffix; + } djbx33a; + struct { + unsigned char padding[16]; + Py_hash_t hashsalt; + } expat; } _Py_HashSecret_t; - PyAPI_DATA(_Py_HashSecret_t) _Py_HashSecret; ``_Py_HashSecret_t`` is initialized in ``Python/random.c:_PyRandom_Init()`` @@ -336,9 +411,9 @@ algorithm in the configure step. hash function selection ----------------------- -The value of the macro ``PY_HASH_ALGORITHM`` defines which hash algorithm is -used internally. It may be set to any of the three values ``PY_HASH_SIPHASH24``, -``PY_HASH_FNV`` or ``PY_HASH_EXTERNAL``. If ``PY_HASH_ALGORITHM`` is not +The value of the macro ``Py_HASH_ALGORITHM`` defines which hash algorithm is +used internally. It may be set to any of the three values ``Py_HASH_SIPHASH24``, +``Py_HASH_FNV`` or ``Py_HASH_EXTERNAL``. If ``Py_HASH_ALGORITHM`` is not defined at all, then the best available algorithm is selected. On platforms wich don't require aligned memory access (``HAVE_ALIGNED_REQUIRED`` not defined) and an unsigned 64bit integer type ``PY_UINT64_T``, SipHash24 is @@ -346,17 +421,17 @@ used. On strict C89 platforms without a 64 bit data type, or architectures such as SPARC, FNV is selected as fallback. A hash algorithm can be selected with an autoconf option, for example ``./configure --with-hash-algorithm=fnv``. -The value ``PY_HASH_EXTERNAL`` allows 3rd parties to provide their own +The value ``Py_HASH_EXTERNAL`` allows 3rd parties to provide their own implementation at compile time. Implementation:: - #if PY_HASH_ALGORITHM == PY_HASH_EXTERNAL + #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL extern PyHash_FuncDef PyHash_Func; - #elif PY_HASH_ALGORITHM == PY_HASH_SIPHASH24 + #elif Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 static PyHash_FuncDef PyHash_Func = {siphash24, "siphash24", 64, 128}; - #elif PY_HASH_ALGORITHM == PY_HASH_FNV + #elif Py_HASH_ALGORITHM == Py_HASH_FNV static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * sizeof(Py_hash_t), 16 * sizeof(Py_hash_t)}; #endif @@ -381,7 +456,8 @@ to the object to reflect the active hash algorithm and its properties. # new fields: algorithm='siphash24', hash_bits=64, - seed_bits=128) + seed_bits=128, + cutoff=0) Necessary modifications to C code