// Written in the D programming language.

/**
 * Computes SHA1 digests of arbitrary data, using an optimized algorithm with SSSE3 instructions.
 *
 * Authors:
 * The general idea is described by Dean Gaudet.
 * Another important observation is published by Max Locktyukhin.
 * (Both implementations are public domain.)
 * Translation to X86 and D by Kai Nacke
 *
 * References:
 *      $(LINK2 http://arctic.org/~dean/crypto/sha1.html)
 *      $(LINK2 http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/, Fast implementation of SHA1)
 */
module std.internal.digest.sha_SSSE3;

import std.conv;

/*
 * Select the implementation:
 *  - position independent code: the SSSE3 code is not used at all,
 *  - 32 bit inline assembler:   USE_SSSE3 + _32Bit,
 *  - 64 bit inline assembler:   USE_SSSE3 + _64Bit.
 * If none of the branches sets USE_SSSE3, the rest of this module is empty.
 */
version(D_PIC)
{
    // Do not use (Bug9378).
}
else version(D_InlineAsm_X86)
{
    private version = USE_SSSE3;
    private version = _32Bit;
}
else version(D_InlineAsm_X86_64)
{
    private version = USE_SSSE3;
    private version = _64Bit;
}

/*
 * The idea is quite simple. The SHA-1 specification defines the following message schedule:
 *     W[i] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
 *
 * To employ SSE, simply write down the formula four times:
 *     W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
 *     W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
 *     W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
 *     W[i+3] = (W[i  ] ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
 * The last formula requires value W[i] computed with the first formula.
 * Because the rotate operation distributes over xor, i.e.
 *     (a ^ b) rol 1 == (a rol 1) ^ (b rol 1),
 * we can replace the last formula with
 *     W[i+3] = (     0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
 * and then calculate
 *     W[i+3] ^= W[i] rol 1
 * which unfortunately requires many additional operations. This approach was described by
 * Dean Gaudet.
 *
 * Max Locktyukhin observed that
 *     W[i] = W[i-A] ^ W[i-B]
 * is equivalent to
 *     W[i] = W[i-2*A] ^ W[i-2*B]
 * (if the indices are still in valid ranges). Using this observation, the formula is
 * translated to
 *     W[i] = (W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32]) rol 2
 * Again, to employ SSE the formula is used four times.
*
 * Later on, the expression W[i] + K(i) is used. (K(i) is the constant used in round i.)
 * Once the 4 W[i] are calculated, we can also add the four K(i) values with one SSE instruction.
 *
 * The 32bit and 64bit implementations are almost identical. The main difference is that there
 * are only 8 XMM registers in 32bit mode. Therefore, space on the stack is needed to save
 * computed values.
 */

version(USE_SSSE3)
{
    /*
     * The general idea is to use the XMM registers as a sliding window over
     * message schedule. XMM0 to XMM7 are used to store the last 64 byte of
     * the message schedule. In 64 bit mode this is fine because of the number of
     * registers. The main difference of the 32 bit code is that a part of the
     * calculated message schedule is saved on the stack because 2 temporary
     * registers are needed.
     */

    /*
     * Number of message words we are precalculating.
     * The code generated for round i also contains the schedule computation
     * for message word PRECALC_AHEAD + i (see round()).
     */
    private immutable int PRECALC_AHEAD = 16;

    /*
     * T1 and T2 are used for intermediate results of computations.
     * Note: in the 32 bit code these are the same registers as BUFFER_PTR (EAX)
     * and STATE_PTR (EBX); the state pointer is therefore reloaded from the
     * stack before the state is saved (see transformSSSE3()).
     */
    private immutable string T1 = "EAX";
    private immutable string T2 = "EBX";

    /* The registers used for the SHA-1 variables.
*/
    private immutable string A = "ECX";
    private immutable string B = "ESI";
    private immutable string C = "EDI";
    private immutable string D = "EBP";
    private immutable string E = "EDX";

    /* Register assignments that differ between the 32 bit and the 64 bit code. */
    version(_32Bit)
    {
        private immutable string SP = "ESP";
        private immutable string BUFFER_PTR = "EAX";
        private immutable string STATE_PTR = "EBX";

        // Control byte for shuffle instruction (only used in round 0-15)
        private immutable string X_SHUFFLECTL = "XMM6";

        // Round constant (only used in round 0-15)
        private immutable string X_CONSTANT = "XMM7";
    }
    version(_64Bit)
    {
        private immutable string SP = "RSP";
        private immutable string BUFFER_PTR = "R9";
        private immutable string STATE_PTR = "R8";

        // Registers for temporary results (XMM10 and XMM11 are also used temporary)
        private immutable string W_TMP = "XMM8";
        private immutable string W_TMP2 = "XMM9";

        // Control byte for shuffle instruction (only used in round 0-15)
        private immutable string X_SHUFFLECTL = "XMM12";

        // Round constant
        private immutable string X_CONSTANT = "XMM13";
    }

    /* The control words for the byte shuffle instruction. */
    align(16) private immutable uint[4] bswap_shufb_ctl =
    [
        0x0001_0203, 0x0405_0607, 0x0809_0a0b, 0x0c0d_0e0f
    ];

    /* The round constants. Every constant is stored four times to fill a whole XMM register. */
    align(16) private immutable uint[16] constants =
    [
        // Constants for round 0-19
        0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999,
        // Constants for round 20-39
        0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1,
        // Constants for round 40-59
        0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc,
        // Constants for round 60-79
        0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
    ];

    /**
     * Minimal decimal formatter usable in CTFE.
     * Only values below 100 are supported; the result has no leading zero.
     */
    private nothrow pure string to_string(uint i)
    {
        assert(i < 100);

        immutable char tens  = cast(char)('0' + (i / 10) % 10);
        immutable char units = cast(char)('0' + i % 10);

        string digits;
        if (i >= 10)
            digits ~= tens;     // two-digit number: emit the leading digit first
        digits ~= units;
        return digits;
    }

    /** Returns the reference to constant used in round i.
*/
    private nothrow pure string constant(uint i)
    {
        // The constant changes every 20 rounds; every constant occupies
        // 16 bytes (4 copies) in the `constants` table.
        return "[constants + 16*"~to_string(i/20)~"]";
    }

    /** Returns the XMM register number used in round i */
    private nothrow pure uint regno(uint i)
    {
        // 4 message words per register, 8 registers used as a ring.
        return (i/4)&7;
    }

    /** Returns reference to storage of vector W[i..i+4]. */
    private nothrow pure string WiV(uint i)
    {
        return "["~SP~" + WI_PTR + "~to_string((i/4)&7)~"*16]";
    }

    /** Returns reference to storage of vector (W + K)[i..i+4]. */
    private nothrow pure string WiKiV(uint i)
    {
        return "["~SP~" + WI_PLUS_KI_PTR + "~to_string((i/4)&3)~"*16]";
    }

    /** Returns reference to storage of value W[i] + K[i]. */
    private nothrow pure string WiKi(uint i)
    {
        return "["~SP~" + WI_PLUS_KI_PTR + 4*"~to_string(i&15)~"]";
    }

    /**
     * Chooses the instruction sequence based on the 32bit or 64bit model.
     *
     * Params:
     *  insn32 = sequence returned when compiled as 32 bit code
     *  insn64 = sequence returned when compiled as 64 bit code
     */
    private nothrow pure string[] swt3264(string[] insn32, string[] insn64)
    {
        version(_32Bit)
        {
            return insn32;
        }
        version(_64Bit)
        {
            return insn64;
        }
    }

    /**
     * Flattens the instruction sequence and wraps it in an asm block.
     *
     * Every instruction is terminated with "; \n"; the result is meant to be
     * used as argument of a string mixin.
     */
    private nothrow pure string wrap(string[] insn)
    {
        string s = "asm {";
        foreach (t; insn) s ~= (t ~ "; \n");
        s ~= "}";
        return s;
        // Is not CTFE:
        // return "asm { " ~ join(insn, "; \n") ~ "}";
    }

    /**
     * Weaves the 2 instruction sequences together.
     *
     * The result alternates between one instruction of seq2 and (up to) dist
     * instructions of seq1, starting with seq2. When one sequence is
     * exhausted, the remainder of the other one is appended.
     *
     * Params:
     *  seq1 = first sequence, consumed dist instructions at a time
     *  seq2 = second sequence, consumed one instruction at a time
     *  dist = number of seq1 instructions between two seq2 instructions; must be > 0
     */
    private nothrow pure string[] weave(string[] seq1, string[] seq2, uint dist = 1)
    {
        // A selective import binds only `min`; the fully qualified name
        // std.algorithm.min is not available through it.
        import std.algorithm : min;

        // With dist == 0 the position in seq1 would never advance.
        assert(dist > 0, "weave: dist must be greater than 0");

        string[] res = [];
        size_t i1 = 0, i2 = 0;
        while (i1 < seq1.length || i2 < seq2.length)
        {
            if (i2 < seq2.length)
            {
                res ~= seq2[i2];
                i2 += 1;
            }
            if (i1 < seq1.length)
            {
                // Clamp the chunk at the end of seq1.
                res ~= seq1[i1 .. min(i1 + dist, seq1.length)];
                i1 += dist;
            }
        }
        return res;
    }

    /**
     * Generates instructions to load state from memory into registers.
*/
    private nothrow pure string[] loadstate(string base, string a, string b, string c, string d, string e)
    {
        // One 32 bit load per state word; word k lives at [base + k*4].
        string[] insn;
        foreach (slot, reg; [a, b, c, d, e])
            insn ~= "mov "~reg~",["~base~" + "~cast(char)('0' + slot)~"*4]";
        return insn;
    }

    /**
     * Generates instructions to update state from registers, saving result in memory.
     * Each register is added to the corresponding state word at [base + k*4].
     */
    private nothrow pure string[] savestate(string base, string a, string b, string c, string d, string e)
    {
        string[] insn;
        foreach (slot, reg; [a, b, c, d, e])
            insn ~= "add ["~base~" + "~cast(char)('0' + slot)~"*4],"~reg;
        return insn;
    }

    /** Calculates Ch(x, y, z) = z ^ (x & (y ^ z)); the result is left in T1. */
    private nothrow pure string[] Ch(string x, string y, string z)
    {
        string[] insn;
        insn ~= "mov "~T1~","~y;    // T1 = y
        insn ~= "xor "~T1~","~z;    // T1 = y ^ z
        insn ~= "and "~T1~","~x;    // T1 = x & (y ^ z)
        insn ~= "xor "~T1~","~z;    // T1 = z ^ (x & (y ^ z))
        return insn;
    }

    /** Calculates Parity(x, y, z) = x ^ y ^ z; the result is left in T1. */
    private nothrow pure string[] Parity(string x, string y, string z)
    {
        string[] insn;
        insn ~= "mov "~T1~","~z;    // T1 = z
        insn ~= "xor "~T1~","~y;    // T1 = z ^ y
        insn ~= "xor "~T1~","~x;    // T1 = z ^ y ^ x
        return insn;
    }

    /**
     * Calculates the majority function Maj(x, y, z), computed here as
     * (x & y) | (z & (x | y)). The result is left in T1; T2 is overwritten.
     */
    private nothrow pure string[] Maj(string x, string y, string z)
    {
        string[] insn;
        insn ~= "mov "~T1~","~y;    // T1 = y
        insn ~= "mov "~T2~","~x;    // T2 = x
        insn ~= "or  "~T1~","~x;    // T1 = x | y
        insn ~= "and "~T2~","~y;    // T2 = x & y
        insn ~= "and "~T1~","~z;    // T1 = z & (x | y)
        insn ~= "or  "~T1~","~T2;   // T1 = (z & (x | y)) | (x & y)
        return insn;
    }

    /**
     * Returns function for round i. Function returns result in T1 and may destroy T2.
     * Rounds 0-19 use Ch, 20-39 and 60-79 use Parity, 40-59 use Maj.
     */
    private nothrow pure string[] F(int i, string b, string c, string d)
    {
        if (i < 0 || i > 79)
            assert(false, "Coding error");

        switch (i / 20)
        {
            case 0:
                return Ch(b, c, d);
            case 2:
                return Maj(b, c, d);
            default:
                // i / 20 is 1 or 3 here.
                return Parity(b, c, d);
        }
    }

    /** Returns instruction used to setup a round.
*/
    private nothrow pure string[] xsetup(int i)
    {
        if (i == 0)
        {
            // Same sequence for 32 bit and 64 bit code.
            return ["movdqa "~X_SHUFFLECTL~",[bswap_shufb_ctl]",
                    "movdqa "~X_CONSTANT~","~constant(i)];
        }
        version(_64Bit)
        {
            // The 64 bit code keeps the round constant in a register, which has
            // to be reloaded every 20 rounds. round() calls this function with
            // PRECALC_AHEAD + i, so i reaches 80 and beyond; there is no
            // constant for those rounds and loading one would read past the
            // end of the `constants` table.
            if (i%20 == 0 && i < 80)
            {
                return ["movdqa "~X_CONSTANT~","~constant(i)];
            }
        }
        return [];
    }

    /**
     * Loads the message words and performs the little to big endian conversion.
     * Requires that the shuffle control word and the round constant is loaded
     * into required XMM register. The BUFFER_PTR register must point to the
     * buffer.
     *
     * The work for 4 message words is distributed over 4 consecutive values
     * of i: load, byte swap (plus store of W in 32 bit mode), add K, store W+K.
     */
    private nothrow pure string[] precalc_00_15(int i)
    {
        int regno = regno(i);

        string W = "XMM" ~ to_string(regno);
        version(_32Bit)
        {
            string W_TMP = "XMM" ~ to_string(regno+2);
        }
        version(_64Bit)
        {
            string W_TMP = "XMM" ~ to_string(regno+8);
        }

        if ((i & 3) == 0)
        {
            return ["movdqu "~W~",["~BUFFER_PTR~" + "~to_string(regno)~"*16]"];
        }
        else if ((i & 3) == 1)
        {
            return ["pshufb "~W~","~X_SHUFFLECTL] ~
                    swt3264(["movdqa "~WiV(i)~","~W], []);
        }
        else if ((i & 3) == 2)
        {
            return ["movdqa "~W_TMP~","~W,
                    "paddd "~W_TMP~","~X_CONSTANT,
                   ];
        }
        else
        {
            return ["movdqa "~WiKiV(i)~","~W_TMP,
                   ];
        }
    }

    /**
     * Done on 4 consecutive W[i] values in a single XMM register
     *     W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
     *     W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
     *     W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
     *     W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
     *
     * This additional calculation unfortunately requires many additional operations
     *     W[i+3] ^= W[i] rol 1
     *
     * Once we have 4 W[i] values in XMM we can also add four K values with one instruction
     *     W[i:i+3] += {K,K,K,K}
     */
    private nothrow pure string[] precalc_16_31(int i)
    {
        int regno = regno(i);

        string W = "XMM" ~ to_string(regno);
        string W_minus_4 = "XMM" ~ to_string((regno-1)&7);
        string W_minus_8 = "XMM" ~ to_string((regno-2)&7);
        string W_minus_12 = "XMM" ~ to_string((regno-3)&7);
        string W_minus_16 = "XMM" ~ to_string((regno-4)&7);
        version(_32Bit)
        {
            // Only 8 XMM registers: the temporaries are taken from the ring.
            string W_TMP = "XMM" ~ to_string((regno+1)&7);
            string W_TMP2 = "XMM" ~ to_string((regno+2)&7);
        }

        if ((i & 3) == 0)
        {
            return ["movdqa "~W~","~W_minus_12,
                    "palignr "~W~","~W_minus_16~",8",   // W[i] = W[i-14]
                    "pxor "~W~","~W_minus_16,           // W[i] ^= W[i-16]
                    "pxor "~W~","~W_minus_8,            // W[i] ^= W[i-8]
                    "movdqa "~W_TMP~","~W_minus_4,
                   ];
        }
        else if ((i & 3) == 1)
        {
            return ["psrldq "~W_TMP~",4",               // W[i-3]
                    "pxor "~W~","~W_TMP,                // W[i] ^= W[i-3]
                    "movdqa "~W_TMP~","~W,
                    "psrld "~W~",31",                   // the two halves of "rol 1"
                    "pslld "~W_TMP~",1",
                   ];
        }
        else if ((i & 3) == 2)
        {
            return ["por "~W~","~W_TMP,                 // W = W rol 1
                    "movdqa "~W_TMP~","~W,
                    "pslldq "~W_TMP~",12",              // move W[i] into the lane of W[i+3]
                    "movdqa "~W_TMP2~","~W_TMP,
                    "pslld "~W_TMP~",1",
                   ];
        }
        else
        {
            return ["psrld "~W_TMP2~",31",
                    "por "~W_TMP~","~W_TMP2,            // W_TMP = W[i] rol 1 (in lane 3)
                    "pxor "~W~","~W_TMP,                // W[i+3] ^= W[i] rol 1
                    "movdqa "~W_TMP~","~W ] ~
                   swt3264(["movdqa "~WiV(i)~","~W,
                            "paddd "~W_TMP~","~constant(i) ],
                           ["paddd "~W_TMP~","~X_CONSTANT ]) ~
                   ["movdqa "~WiKiV(i)~","~W_TMP];
        }
    }

    /**
     * Performs the main calculation as described above, i.e.
     *     W[i] = (W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32]) rol 2
     * for 4 message words at once.
     */
    private nothrow pure string[] precalc_32_79(int i)
    {
        int regno = regno(i);

        string W = "XMM" ~ to_string(regno);
        string W_minus_4 = "XMM" ~ to_string((regno-1)&7);
        string W_minus_8 = "XMM" ~ to_string((regno-2)&7);
        string W_minus_16 = "XMM" ~ to_string((regno-4)&7);
        version(_32Bit)
        {
            // W[i-28] and W[i-32] are read from the copy of the message
            // schedule on the stack (written via WiV()).
            string W_minus_28 = "["~SP~" + WI_PTR + "~ to_string((regno-7)&7)~"*16]";
            string W_minus_32 = "["~SP~" + WI_PTR + "~ to_string((regno-8)&7)~"*16]";
            string W_TMP = "XMM" ~ to_string((regno+1)&7);
        }
        version(_64Bit)
        {
            string W_minus_28 = "XMM" ~ to_string((regno-7)&7);
            string W_minus_32 = "XMM" ~ to_string((regno-8)&7);
        }

        if ((i & 3) == 0)
        {
            return swt3264(["movdqa "~W~","~W_minus_32], []) ~
                   ["movdqa "~W_TMP~","~W_minus_4,
                    "pxor "~W~","~W_minus_28,         // W is W_minus_32 before xor
                    "palignr "~W_TMP~","~W_minus_8~",8",
                   ];
        }
        else if ((i & 3) == 1)
        {
            return ["pxor "~W~","~W_minus_16,
                    "pxor "~W~","~W_TMP,
                    "movdqa "~W_TMP~","~W,
                   ];
        }
        else if ((i & 3) == 2)
        {
            return ["psrld "~W~",30",                 // the two halves of "rol 2"
                    "pslld "~W_TMP~",2",
                    "por "~W_TMP~","~W,
                   ];
        }
        else
        {
            if (i < 76)
                return ["movdqa "~W~","~W_TMP] ~
                       swt3264(["movdqa "~WiV(i)~","~W,
                                "paddd "~W_TMP~","~constant(i)],
                               ["paddd "~W_TMP~","~X_CONSTANT]) ~
                       ["movdqa "~WiKiV(i)~","~W_TMP];
            else
                // Last group: W itself is not written back, only W + K is stored.
                return swt3264(["paddd "~W_TMP~","~constant(i)],
                               ["paddd "~W_TMP~","~X_CONSTANT]) ~
                       ["movdqa "~WiKiV(i)~","~W_TMP];
        }
    }

    /** Choose right precalc method. Returns an empty sequence if i is not in 0 .. 80. */
    private nothrow pure string[] precalc(int i)
    {
        if (i >= 0 && i < 16) return precalc_00_15(i);
        if (i >= 16 && i < 32) return precalc_16_31(i);
        if (i >= 32 && i < 80) return precalc_32_79(i);
        return [];
    }

    /**
     * Return code for round i and i+1.
     * Performs the following rotation:
     * in=>out: A=>D, B=>E, C=>A, D=>B, E=>C
     *
     * The round instructions are interleaved with the message schedule
     * computation for words PRECALC_AHEAD + i and PRECALC_AHEAD + i + 1.
     */
    private nothrow pure string[] round(int i, string a, string b, string c, string d, string e)
    {
        return xsetup(PRECALC_AHEAD + i) ~
               weave(F(i, b, c, d) ~ // Returns result in T1; may destroy T2
               ["add "~e~","~WiKi(i),
                "ror "~b~",2",
                "mov "~T2~","~a,
                "add "~d~","~WiKi(i+1),
                "rol "~T2~",5",
                "add "~e~","~T1 ],
               precalc(PRECALC_AHEAD + i), 2) ~
               weave(
               ["add "~T2~","~e,  // T2 = (A <<< 5) + F(B, C, D) + Wi + Ki + E
                "mov "~e~","~T2,
                "rol "~T2~",5",
                "add "~d~","~T2 ] ~
               F(i+1, a, b, c) ~ // Returns result in T1; may destroy T2
               ["add "~d~","~T1,
                "ror "~a~",2"],
               precalc(PRECALC_AHEAD + i+1), 2);
    }

    // Offset into stack (see below)
    version(_32Bit)
    {
        private enum { STATE_OFS = 4, WI_PLUS_KI_PTR = 8, WI_PTR = 72 };
    }
    version(_64Bit)
    {
        private enum { WI_PLUS_KI_PTR = 0 };
    }

    /** The prologue sequence.
*/
    private nothrow pure string[] prologue()
    {
        version(_32Bit)
        {
            /*
             * Parameters:
             *  EAX contains pointer to input buffer
             *
             * Stack layout as follows:
             * +----------------+
             * | ptr to state   |
             * +----------------+
             * | return address |
             * +----------------+
             * | EBP            |
             * +----------------+
             * | ESI            |
             * +----------------+
             * | EDI            |
             * +----------------+
             * | EBX            |
             * +----------------+
             * | Space for      |
             * | Wi             | <- ESP+72
             * +----------------+
             * | Space for      |
             * | Wi+Ki          | <- ESP+8
             * +----------------+ <- 16byte aligned
             * | ptr to state   | <- ESP+4
             * +----------------+
             * | old ESP        | <- ESP
             * +----------------+
             */
            // The instruction strings below use the register names literally.
            static assert(BUFFER_PTR == "EAX");
            static assert(STATE_PTR == "EBX");
            return [// Save registers according to calling convention
                    "push EBP",
                    "push ESI",
                    "push EDI",
                    "push EBX",
                    // Load parameters
                    // (4 saved registers + return address are above the parameter)
                    "mov EBX, [ESP + 5*4]", //pointer to state
                    // Align stack
                    "mov EBP, ESP",             // remember ESP; restored by "pop ESP" in the epilogue
                    "sub ESP, 4*16 + 8*16",     // room for Wi+Ki (4 vectors) and Wi (8 vectors)
                    "and ESP, 0xffff_fff0",
                    "push EBX",                 // ptr to state, found at ESP + STATE_OFS
                    "push EBP",                 // old ESP
                   ];
        }
        version(_64Bit)
        {
            /*
             * Parameters:
             *  RSI contains pointer to state
             *  RDI contains pointer to input buffer
             *
             * Stack layout as follows:
             * +----------------+
             * | return address |
             * +----------------+
             * | RBP            |
             * +----------------+
             * | RBX            |
             * +----------------+
             * | Unused         |
             * +----------------+
             * | Space for      |
             * | Wi+Ki          | <- RSP
             * +----------------+ <- 16byte aligned
             */
            return [// Save registers according to calling convention
                    "push RBP",
                    "push RBX",
                    // Save parameters
                    // (ESI and EDI are used for the SHA-1 variables B and C)
                    "mov "~STATE_PTR~", RSI", //pointer to state
                    "mov "~BUFFER_PTR~", RDI", //pointer to buffer
                    // Align stack
                    // 4 vectors for Wi+Ki plus the 8 unused bytes from the layout above
                    "sub RSP, 4*16+8",
                   ];
        }
    }

    /**
     * The epilogue sequence. Just pop the saved registers from stack and return to caller.
*/
    private nothrow pure string[] epilogue()
    {
        version(_32Bit)
        {
            return ["pop ESP",      // old ESP, pushed last by the prologue
                    "pop EBX",
                    "pop EDI",
                    "pop ESI",
                    "pop EBP",
                    "ret 4",        // also removes the state pointer from the stack
                   ];
        }
        version(_64Bit)
        {
            return ["add RSP,4*16+8",   // release the space reserved by the prologue
                    "pop RBX",
                    "pop RBP",
                    "ret 0",
                   ];
        }
    }

    /**
     * Performs the SHA-1 transformation for one 64 byte block of input.
     *
     * The function consists only of generated inline assembler and is declared
     * naked: the parameters are not accessed by name but taken from the
     * registers / stack slots documented in prologue().
     *
     * Params:
     *  state  = pointer to the five 32 bit state words; the computed values
     *           are added to the words in place
     *  buffer = pointer to the 64 byte block to process
     */
    public nothrow pure void transformSSSE3(uint[5]* state, const(ubyte[64])* buffer)
    {
        mixin(wrap(["naked;"] ~ prologue()));
        // Precalc first 4*16=64 bytes
        mixin(wrap(xsetup(0)));
        mixin(wrap(weave(precalc(0)~precalc(1)~precalc(2)~precalc(3),
                         precalc(4)~precalc(5)~precalc(6)~precalc(7))));
        // The load of the state is interleaved with the rest of the precalculation.
        mixin(wrap(weave(loadstate(STATE_PTR, A, B, C, D, E),
                         weave(precalc(8)~precalc(9)~precalc(10)~precalc(11),
                               precalc(12)~precalc(13)~precalc(14)~precalc(15)))));
        // Every round() call generates two rounds and rotates the roles of the
        // registers (A=>D, B=>E, C=>A, D=>B, E=>C), hence the changing arguments.
        // Round 1 (rounds 0-19)
        mixin(wrap(round( 0, A, B, C, D, E)));
        mixin(wrap(round( 2, D, E, A, B, C)));
        mixin(wrap(round( 4, B, C, D, E, A)));
        mixin(wrap(round( 6, E, A, B, C, D)));
        mixin(wrap(round( 8, C, D, E, A, B)));
        mixin(wrap(round(10, A, B, C, D, E)));
        mixin(wrap(round(12, D, E, A, B, C)));
        mixin(wrap(round(14, B, C, D, E, A)));
        mixin(wrap(round(16, E, A, B, C, D)));
        mixin(wrap(round(18, C, D, E, A, B)));
        // Round 2 (rounds 20-39)
        mixin(wrap(round(20, A, B, C, D, E)));
        mixin(wrap(round(22, D, E, A, B, C)));
        mixin(wrap(round(24, B, C, D, E, A)));
        mixin(wrap(round(26, E, A, B, C, D)));
        mixin(wrap(round(28, C, D, E, A, B)));
        mixin(wrap(round(30, A, B, C, D, E)));
        mixin(wrap(round(32, D, E, A, B, C)));
        mixin(wrap(round(34, B, C, D, E, A)));
        mixin(wrap(round(36, E, A, B, C, D)));
        mixin(wrap(round(38, C, D, E, A, B)));
        // Round 3 (rounds 40-59)
        mixin(wrap(round(40, A, B, C, D, E)));
        mixin(wrap(round(42, D, E, A, B, C)));
        mixin(wrap(round(44, B, C, D, E, A)));
        mixin(wrap(round(46, E, A, B, C, D)));
        mixin(wrap(round(48, C, D, E, A, B)));
        mixin(wrap(round(50, A, B, C, D, E)));
        mixin(wrap(round(52, D, E, A, B, C)));
        mixin(wrap(round(54, B, C, D, E, A)));
        mixin(wrap(round(56, E, A, B, C, D)));
        mixin(wrap(round(58, C, D, E, A, B)));
        // Round 4 (rounds 60-79)
        mixin(wrap(round(60, A, B, C, D, E)));
        mixin(wrap(round(62, D, E, A, B, C)));
        mixin(wrap(round(64, B, C, D, E, A)));
        mixin(wrap(round(66, E, A, B, C, D)));
        mixin(wrap(round(68, C, D, E, A, B)));
        mixin(wrap(round(70, A, B, C, D, E)));
        mixin(wrap(round(72, D, E, A, B, C)));
        mixin(wrap(round(74, B, C, D, E, A)));
        mixin(wrap(round(76, E, A, B, C, D)));
        mixin(wrap(round(78, C, D, E, A, B)));
        version(_32Bit)
        {
            // Load pointer to state
            // (STATE_PTR is the same register as T2, which the rounds overwrite)
            mixin(wrap(["mov "~STATE_PTR~",[ESP + STATE_OFS]"]));
        }
        mixin(wrap(savestate(STATE_PTR, A, B, C, D, E)));
        mixin(wrap(epilogue()));
    }
}