- /* x64 */
- #if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
- #define SCRYPT_SALSA64_AVX
- asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
- asm_naked_fn(scrypt_ChunkMix_avx)
- a1(push rbp)
- a2(mov rbp, rsp)
- a2(and rsp, ~63) /* align the scratch area to 64 bytes */
- a2(sub rsp, 128) /* 128-byte scratch to hold the saved block */
- a2(lea rcx,[rcx*2]) /* rcx = r * 2 = blocks per chunk */
- a2(shl rcx,7) /* rcx = chunk size in bytes (blocks * 128) */
- a2(lea r9,[rcx-128]) /* byte offset of the last block */
- a2(lea rax,[rsi+r9]) /* rax = &Bin[last block] */
- a2(lea r9,[rdx+r9]) /* r9 = &Bxor[last block], unused when Bxor is NULL */
- a2(and rdx, rdx) /* sets ZF when Bxor == NULL */
- a2(vmovdqa xmm0,[rax+0])
- a2(vmovdqa xmm1,[rax+16])
- a2(vmovdqa xmm2,[rax+32])
- a2(vmovdqa xmm3,[rax+48])
- a2(vmovdqa xmm4,[rax+64])
- a2(vmovdqa xmm5,[rax+80])
- a2(vmovdqa xmm6,[rax+96])
- a2(vmovdqa xmm7,[rax+112])
- a1(jz scrypt_ChunkMix_avx_no_xor1)
- a3(vpxor xmm0,xmm0,[r9+0])
- a3(vpxor xmm1,xmm1,[r9+16])
- a3(vpxor xmm2,xmm2,[r9+32])
- a3(vpxor xmm3,xmm3,[r9+48])
- a3(vpxor xmm4,xmm4,[r9+64])
- a3(vpxor xmm5,xmm5,[r9+80])
- a3(vpxor xmm6,xmm6,[r9+96])
- a3(vpxor xmm7,xmm7,[r9+112])
- a1(scrypt_ChunkMix_avx_no_xor1:)
- a2(xor r9,r9) /* r9 = byte offset of block i within the chunk */
- a2(xor r8,r8) /* r8 toggles 0 / chunk size to pick the output half */
- a1(scrypt_ChunkMix_avx_loop:)
- a2(and rdx, rdx) /* re-test Bxor for the conditional xor below */
- a3(vpxor xmm0,xmm0,[rsi+r9+0])
- a3(vpxor xmm1,xmm1,[rsi+r9+16])
- a3(vpxor xmm2,xmm2,[rsi+r9+32])
- a3(vpxor xmm3,xmm3,[rsi+r9+48])
- a3(vpxor xmm4,xmm4,[rsi+r9+64])
- a3(vpxor xmm5,xmm5,[rsi+r9+80])
- a3(vpxor xmm6,xmm6,[rsi+r9+96])
- a3(vpxor xmm7,xmm7,[rsi+r9+112])
- a1(jz scrypt_ChunkMix_avx_no_xor2)
- a3(vpxor xmm0,xmm0,[rdx+r9+0])
- a3(vpxor xmm1,xmm1,[rdx+r9+16])
- a3(vpxor xmm2,xmm2,[rdx+r9+32])
- a3(vpxor xmm3,xmm3,[rdx+r9+48])
- a3(vpxor xmm4,xmm4,[rdx+r9+64])
- a3(vpxor xmm5,xmm5,[rdx+r9+80])
- a3(vpxor xmm6,xmm6,[rdx+r9+96])
- a3(vpxor xmm7,xmm7,[rdx+r9+112])
- a1(scrypt_ChunkMix_avx_no_xor2:)
- a2(vmovdqa [rsp+0],xmm0) /* save X for the feed-forward add after the rounds */
- a2(vmovdqa [rsp+16],xmm1)
- a2(vmovdqa [rsp+32],xmm2)
- a2(vmovdqa [rsp+48],xmm3)
- a2(vmovdqa [rsp+64],xmm4)
- a2(vmovdqa [rsp+80],xmm5)
- a2(vmovdqa [rsp+96],xmm6)
- a2(vmovdqa [rsp+112],xmm7)
- a2(mov rax,8) /* 8 rounds of Salsa64/8 */
- a1(scrypt_salsa64_avx_loop: )
- a3(vpaddq xmm8, xmm0, xmm2)
- a3(vpaddq xmm9, xmm1, xmm3)
- a3(vpshufd xmm8, xmm8, 0xb1)
- a3(vpshufd xmm9, xmm9, 0xb1)
- a3(vpxor xmm6, xmm6, xmm8)
- a3(vpxor xmm7, xmm7, xmm9)
- a3(vpaddq xmm10, xmm0, xmm6)
- a3(vpaddq xmm11, xmm1, xmm7)
- a3(vpsrlq xmm8, xmm10, 51)
- a3(vpsrlq xmm9, xmm11, 51)
- a3(vpsllq xmm10, xmm10, 13)
- a3(vpsllq xmm11, xmm11, 13)
- a3(vpxor xmm4, xmm4, xmm8)
- a3(vpxor xmm5, xmm5, xmm9)
- a3(vpxor xmm4, xmm4, xmm10)
- a3(vpxor xmm5, xmm5, xmm11)
- a3(vpaddq xmm8, xmm6, xmm4)
- a3(vpaddq xmm9, xmm7, xmm5)
- a3(vpsrlq xmm10, xmm8, 25)
- a3(vpsrlq xmm11, xmm9, 25)
- a3(vpsllq xmm8, xmm8, 39)
- a3(vpsllq xmm9, xmm9, 39)
- a3(vpxor xmm2, xmm2, xmm10)
- a3(vpxor xmm3, xmm3, xmm11)
- a3(vpxor xmm2, xmm2, xmm8)
- a3(vpxor xmm3, xmm3, xmm9)
- a3(vpaddq xmm10, xmm4, xmm2)
- a3(vpaddq xmm11, xmm5, xmm3)
- a3(vpshufd xmm10, xmm10, 0xb1)
- a3(vpshufd xmm11, xmm11, 0xb1)
- a3(vpxor xmm0, xmm0, xmm10)
- a3(vpxor xmm1, xmm1, xmm11)
- a2(vmovdqa xmm8, xmm2)
- a2(vmovdqa xmm9, xmm3)
- a4(vpalignr xmm2, xmm6, xmm7, 8)
- a4(vpalignr xmm3, xmm7, xmm6, 8)
- a4(vpalignr xmm6, xmm9, xmm8, 8)
- a4(vpalignr xmm7, xmm8, xmm9, 8)
- a2(sub rax, 2) /* the loop body performs two rounds per pass */
- a3(vpaddq xmm10, xmm0, xmm2)
- a3(vpaddq xmm11, xmm1, xmm3)
- a3(vpshufd xmm10, xmm10, 0xb1)
- a3(vpshufd xmm11, xmm11, 0xb1)
- a3(vpxor xmm6, xmm6, xmm10)
- a3(vpxor xmm7, xmm7, xmm11)
- a3(vpaddq xmm8, xmm0, xmm6)
- a3(vpaddq xmm9, xmm1, xmm7)
- a3(vpsrlq xmm10, xmm8, 51)
- a3(vpsrlq xmm11, xmm9, 51)
- a3(vpsllq xmm8, xmm8, 13)
- a3(vpsllq xmm9, xmm9, 13)
- a3(vpxor xmm5, xmm5, xmm10)
- a3(vpxor xmm4, xmm4, xmm11)
- a3(vpxor xmm5, xmm5, xmm8)
- a3(vpxor xmm4, xmm4, xmm9)
- a3(vpaddq xmm10, xmm6, xmm5)
- a3(vpaddq xmm11, xmm7, xmm4)
- a3(vpsrlq xmm8, xmm10, 25)
- a3(vpsrlq xmm9, xmm11, 25)
- a3(vpsllq xmm10, xmm10, 39)
- a3(vpsllq xmm11, xmm11, 39)
- a3(vpxor xmm2, xmm2, xmm8)
- a3(vpxor xmm3, xmm3, xmm9)
- a3(vpxor xmm2, xmm2, xmm10)
- a3(vpxor xmm3, xmm3, xmm11)
- a3(vpaddq xmm8, xmm5, xmm2)
- a3(vpaddq xmm9, xmm4, xmm3)
- a3(vpshufd xmm8, xmm8, 0xb1)
- a3(vpshufd xmm9, xmm9, 0xb1)
- a3(vpxor xmm0, xmm0, xmm8)
- a3(vpxor xmm1, xmm1, xmm9)
- a2(vmovdqa xmm10, xmm2)
- a2(vmovdqa xmm11, xmm3)
- a4(vpalignr xmm2, xmm6, xmm7, 8)
- a4(vpalignr xmm3, xmm7, xmm6, 8)
- a4(vpalignr xmm6, xmm11, xmm10, 8)
- a4(vpalignr xmm7, xmm10, xmm11, 8)
- a1(ja scrypt_salsa64_avx_loop)
- a3(vpaddq xmm0,xmm0,[rsp+0])
- a3(vpaddq xmm1,xmm1,[rsp+16])
- a3(vpaddq xmm2,xmm2,[rsp+32])
- a3(vpaddq xmm3,xmm3,[rsp+48])
- a3(vpaddq xmm4,xmm4,[rsp+64])
- a3(vpaddq xmm5,xmm5,[rsp+80])
- a3(vpaddq xmm6,xmm6,[rsp+96])
- a3(vpaddq xmm7,xmm7,[rsp+112])
- a2(lea rax,[r8+r9]) /* combine the half-select toggle with the block offset */
- a2(xor r8,rcx) /* flip between the first and second half of Bout */
- a2(and rax,~0xff) /* round down to a 256-byte (two-block) boundary */
- a2(add r9,128) /* advance to the next input block */
- a2(shr rax,1) /* halve: even blocks land in 0..r-1, odd blocks in r..2r-1 */
- a2(add rax, rdi) /* rax = &Bout[interleaved block] */
- a2(cmp r9,rcx) /* loop until all 2r blocks are mixed */
- a2(vmovdqa [rax+0],xmm0)
- a2(vmovdqa [rax+16],xmm1)
- a2(vmovdqa [rax+32],xmm2)
- a2(vmovdqa [rax+48],xmm3)
- a2(vmovdqa [rax+64],xmm4)
- a2(vmovdqa [rax+80],xmm5)
- a2(vmovdqa [rax+96],xmm6)
- a2(vmovdqa [rax+112],xmm7)
- a1(jne scrypt_ChunkMix_avx_loop)
- a2(mov rsp, rbp)
- a1(pop rbp)
- a1(ret)
- asm_naked_fn_end(scrypt_ChunkMix_avx)
- #endif
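/*
 * Rotate idiom: AVX prior to AVX-512 has no 64-bit vector rotate, so the
 * Salsa64 rotates in both the asm path above and the intrinsic path below are
 * built from a left shift, a right shift by (64 - n), and two XORs (e.g.
 * vpsllq by 13 paired with vpsrlq by 51). A minimal sketch of the equivalent
 * idiom, assuming SSE2 only; mm_rotl_epi64 is an illustrative macro, not a
 * real intrinsic:
 */
#if 0
#include <emmintrin.h>

/* rotate both 64-bit lanes of v left by n (1..63) */
#define mm_rotl_epi64(v, n) \
	_mm_or_si128(_mm_slli_epi64((v), (n)), _mm_srli_epi64((v), 64 - (n)))

/* the kernels fold the OR away: x ^= rotl(z, n) is done as
   x ^= (z >> (64 - n)); x ^= (z << n); */
#endif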
- /* intrinsic */
- #if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX)
- #define SCRYPT_SALSA64_AVX
- static void asm_calling_convention
- scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
- uint32_t i, blocksPerChunk = r * 2, half = 0;
- xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
- size_t rounds;
- /* 1: X = B_{2r - 1} */
- xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
- x0 = xmmp[0];
- x1 = xmmp[1];
- x2 = xmmp[2];
- x3 = xmmp[3];
- x4 = xmmp[4];
- x5 = xmmp[5];
- x6 = xmmp[6];
- x7 = xmmp[7];
- if (Bxor) {
- xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
- x0 = _mm_xor_si128(x0, xmmp[0]);
- x1 = _mm_xor_si128(x1, xmmp[1]);
- x2 = _mm_xor_si128(x2, xmmp[2]);
- x3 = _mm_xor_si128(x3, xmmp[3]);
- x4 = _mm_xor_si128(x4, xmmp[4]);
- x5 = _mm_xor_si128(x5, xmmp[5]);
- x6 = _mm_xor_si128(x6, xmmp[6]);
- x7 = _mm_xor_si128(x7, xmmp[7]);
- }
- /* 2: for i = 0 to 2r - 1 do */
- for (i = 0; i < blocksPerChunk; i++, half ^= r) {
- /* 3: X = H(X ^ B_i) */
- xmmp = (xmmi *)scrypt_block(Bin, i);
- x0 = _mm_xor_si128(x0, xmmp[0]);
- x1 = _mm_xor_si128(x1, xmmp[1]);
- x2 = _mm_xor_si128(x2, xmmp[2]);
- x3 = _mm_xor_si128(x3, xmmp[3]);
- x4 = _mm_xor_si128(x4, xmmp[4]);
- x5 = _mm_xor_si128(x5, xmmp[5]);
- x6 = _mm_xor_si128(x6, xmmp[6]);
- x7 = _mm_xor_si128(x7, xmmp[7]);
- if (Bxor) {
- xmmp = (xmmi *)scrypt_block(Bxor, i);
- x0 = _mm_xor_si128(x0, xmmp[0]);
- x1 = _mm_xor_si128(x1, xmmp[1]);
- x2 = _mm_xor_si128(x2, xmmp[2]);
- x3 = _mm_xor_si128(x3, xmmp[3]);
- x4 = _mm_xor_si128(x4, xmmp[4]);
- x5 = _mm_xor_si128(x5, xmmp[5]);
- x6 = _mm_xor_si128(x6, xmmp[6]);
- x7 = _mm_xor_si128(x7, xmmp[7]);
- }
- t0 = x0;
- t1 = x1;
- t2 = x2;
- t3 = x3;
- t4 = x4;
- t5 = x5;
- t6 = x6;
- t7 = x7;
- for (rounds = 8; rounds; rounds -= 2) {
- z0 = _mm_add_epi64(x0, x2);
- z1 = _mm_add_epi64(x1, x3);
- z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
- z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
- x6 = _mm_xor_si128(x6, z0);
- x7 = _mm_xor_si128(x7, z1);
- z0 = _mm_add_epi64(x6, x0);
- z1 = _mm_add_epi64(x7, x1);
- z2 = _mm_srli_epi64(z0, 64-13);
- z3 = _mm_srli_epi64(z1, 64-13);
- z0 = _mm_slli_epi64(z0, 13);
- z1 = _mm_slli_epi64(z1, 13);
- x4 = _mm_xor_si128(x4, z2);
- x5 = _mm_xor_si128(x5, z3);
- x4 = _mm_xor_si128(x4, z0);
- x5 = _mm_xor_si128(x5, z1);
- z0 = _mm_add_epi64(x4, x6);
- z1 = _mm_add_epi64(x5, x7);
- z2 = _mm_srli_epi64(z0, 64-39);
- z3 = _mm_srli_epi64(z1, 64-39);
- z0 = _mm_slli_epi64(z0, 39);
- z1 = _mm_slli_epi64(z1, 39);
- x2 = _mm_xor_si128(x2, z2);
- x3 = _mm_xor_si128(x3, z3);
- x2 = _mm_xor_si128(x2, z0);
- x3 = _mm_xor_si128(x3, z1);
- z0 = _mm_add_epi64(x2, x4);
- z1 = _mm_add_epi64(x3, x5);
- z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
- z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
- x0 = _mm_xor_si128(x0, z0);
- x1 = _mm_xor_si128(x1, z1);
- z0 = x2;
- z1 = x3;
- x2 = _mm_alignr_epi8(x6, x7, 8);
- x3 = _mm_alignr_epi8(x7, x6, 8);
- x6 = _mm_alignr_epi8(z1, z0, 8);
- x7 = _mm_alignr_epi8(z0, z1, 8);
- z0 = _mm_add_epi64(x0, x2);
- z1 = _mm_add_epi64(x1, x3);
- z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
- z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
- x6 = _mm_xor_si128(x6, z0);
- x7 = _mm_xor_si128(x7, z1);
- z0 = _mm_add_epi64(x6, x0);
- z1 = _mm_add_epi64(x7, x1);
- z2 = _mm_srli_epi64(z0, 64-13);
- z3 = _mm_srli_epi64(z1, 64-13);
- z0 = _mm_slli_epi64(z0, 13);
- z1 = _mm_slli_epi64(z1, 13);
- x5 = _mm_xor_si128(x5, z2);
- x4 = _mm_xor_si128(x4, z3);
- x5 = _mm_xor_si128(x5, z0);
- x4 = _mm_xor_si128(x4, z1);
- z0 = _mm_add_epi64(x5, x6);
- z1 = _mm_add_epi64(x4, x7);
- z2 = _mm_srli_epi64(z0, 64-39);
- z3 = _mm_srli_epi64(z1, 64-39);
- z0 = _mm_slli_epi64(z0, 39);
- z1 = _mm_slli_epi64(z1, 39);
- x2 = _mm_xor_si128(x2, z2);
- x3 = _mm_xor_si128(x3, z3);
- x2 = _mm_xor_si128(x2, z0);
- x3 = _mm_xor_si128(x3, z1);
- z0 = _mm_add_epi64(x2, x5);
- z1 = _mm_add_epi64(x3, x4);
- z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
- z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
- x0 = _mm_xor_si128(x0, z0);
- x1 = _mm_xor_si128(x1, z1);
- z0 = x2;
- z1 = x3;
- x2 = _mm_alignr_epi8(x6, x7, 8);
- x3 = _mm_alignr_epi8(x7, x6, 8);
- x6 = _mm_alignr_epi8(z1, z0, 8);
- x7 = _mm_alignr_epi8(z0, z1, 8);
- }
- x0 = _mm_add_epi64(x0, t0);
- x1 = _mm_add_epi64(x1, t1);
- x2 = _mm_add_epi64(x2, t2);
- x3 = _mm_add_epi64(x3, t3);
- x4 = _mm_add_epi64(x4, t4);
- x5 = _mm_add_epi64(x5, t5);
- x6 = _mm_add_epi64(x6, t6);
- x7 = _mm_add_epi64(x7, t7);
- /* 4: Y_i = X */
- /* 6: B'[0..r-1] = Y_even */
- /* 6: B'[r..2r-1] = Y_odd */
- xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
- xmmp[0] = x0;
- xmmp[1] = x1;
- xmmp[2] = x2;
- xmmp[3] = x3;
- xmmp[4] = x4;
- xmmp[5] = x5;
- xmmp[6] = x6;
- xmmp[7] = x7;
- }
- }
- #endif
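/*
 * Output interleave: per the step-6 comments above, even-numbered mixed
 * blocks go to Bout[0..r-1] and odd-numbered ones to Bout[r..2r-1]. The
 * intrinsic code indexes this as (i / 2) + half with half toggling between 0
 * and r each iteration; the asm computes the same slot from r8/r9. A minimal
 * index-only sketch of that mapping (r = 4 and the printf are illustrative):
 */
#if 0
#include <stdio.h>

int
main(void) {
	unsigned r = 4, half = 0, i;
	for (i = 0; i < r * 2; i++, half ^= r)
		printf("block %u -> Bout slot %u\n", i, (i / 2) + half);
	/* prints 0->0, 1->4, 2->1, 3->5, 4->2, 5->6, 6->3, 7->7 */
	return 0;
}
#endif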
- #if defined(SCRYPT_SALSA64_AVX)
- /* uses salsa64_core_tangle_sse2 */
-
- #undef SCRYPT_MIX
- #define SCRYPT_MIX "Salsa64/8-AVX"
- #undef SCRYPT_SALSA64_INCLUDED
- #define SCRYPT_SALSA64_INCLUDED
- #endif