117117 .set T1, REG_T1
118118.endm
119119
120- #define K_BASE %r8
121120#define HASH_PTR %r9	/* -> SHA-1 state (CTX); words updated via UPDATE_HASH off(HASH_PTR) */
121+ #define BLOCKS_CTR %r8	/* count of 64-byte blocks left to process (loaded from CNT) */
122122#define BUFFER_PTR %r10	/* input pointer for one of the two software-pipelined blocks */
123123#define BUFFER_PTR2 %r13	/* input pointer for the other pipelined block */
124- #define BUFFER_END %r11
125124
126125#define PRECALC_BUF %r14	/* scratch buffer being filled with precalculated W+K values */
127126#define WK_BUF %r15	/* W+K buffer consumed by the round code (xchg'd with PRECALC_BUF) */
205204 * blended AVX2 and ALU instruction scheduling
206205 * 1 vector iteration per 8 rounds
207206 */
208- vmovdqu (( i * 2 ) + PRECALC_OFFSET )(BUFFER_PTR), W_TMP
207+ vmovdqu (i * 2 )(BUFFER_PTR), W_TMP
209208 .elseif ((i & 7 ) == 1 )
210- vinsertf128 $1 , ((( i-1 ) * 2 )+PRECALC_OFFSET )(BUFFER_PTR2),\
209+ vinsertf128 $1 , ((i-1 ) * 2 )(BUFFER_PTR2),\
211210 WY_TMP, WY_TMP
212211 .elseif ((i & 7 ) == 2 )
213212 vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
214213 .elseif ((i & 7 ) == 4 )
215- vpaddd K_XMM(K_BASE ), WY, WY_TMP
214+ vpaddd K_XMM + K_XMM_AR( %rip ), WY, WY_TMP
216215 .elseif ((i & 7 ) == 7 )
217216 vmovdqu WY_TMP, PRECALC_WK(i&~7 )
218217
255254 vpxor WY, WY_TMP, WY_TMP
256255 .elseif ((i & 7 ) == 7 )
257256 vpxor WY_TMP2, WY_TMP, WY
258- vpaddd K_XMM(K_BASE ), WY, WY_TMP
257+ vpaddd K_XMM + K_XMM_AR( %rip ), WY, WY_TMP
259258 vmovdqu WY_TMP, PRECALC_WK(i&~7 )
260259
261260 PRECALC_ROTATE_WY
291290 vpsrld $30 , WY, WY
292291 vpor WY, WY_TMP, WY
293292 .elseif ((i & 7 ) == 7 )
294- vpaddd K_XMM(K_BASE ), WY, WY_TMP
293+ vpaddd K_XMM + K_XMM_AR( %rip ), WY, WY_TMP
295294 vmovdqu WY_TMP, PRECALC_WK(i&~7 )
296295
297296 PRECALC_ROTATE_WY
446445
447446.endm
448447
448+ /* ADD_IF_GE a, b, c, d — conditionally advance a buffer pointer:
449+  *   \a += \d  if  \b >= \c  (signed compare); otherwise \a is unchanged.
450+  * Clobbers RTA (used as a temporary) and the flags.
451+  */
452+ .macro ADD_IF_GE a, b, c, d
453+ 	mov \a, RTA			/* RTA = \a */
454+ 	add $\d, RTA			/* RTA = \a + \d (candidate new value) */
455+ 	cmp $\c, \b			/* evaluate \b - \c */
456+ 	cmovge RTA, \a			/* commit only when \b >= \c (signed) */
457+ .endm
457+
449458/*
450459 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
451460 */
463472 lea (2*4*80 +32 )(%rsp ), WK_BUF
464473
465474 # Precalc WK for first 2 blocks
466- PRECALC_OFFSET = 0
475+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2 , 64
467476 .set i, 0
468477 .rept 160
469478 PRECALC i
470479 .set i, i + 1
471480 .endr
472- PRECALC_OFFSET = 128
481+
482+ /* Go to next block if needed */
483+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3 , 128
484+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4 , 128
473485 xchg WK_BUF, PRECALC_BUF
474486
475487 .align 32
@@ -479,8 +491,8 @@ _loop:
479491	 * BLOCKS_CTR holds the number of 64-byte blocks still to be
480492	 * processed; it is decremented once per block and reaching zero
481493	 * signals the last block is done.
482494	 */
482- cmp K_BASE, BUFFER_PTR
483- jne _begin
494+ test BLOCKS_CTR, BLOCKS_CTR
495+ jnz _begin
484496 .align 32
485497 jmp _end
486498 .align 32
@@ -512,10 +524,10 @@ _loop0:
512524 .set j, j+2
513525 .endr
514526
515- add $(2* 64 ), BUFFER_PTR /* move to next odd-64-byte block */
516- cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */
517- cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
518-
527+ /* Update Counter */
528+ sub $1 , BLOCKS_CTR
529+ /* Move to the next block only if needed */
530+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4 , 128
519531 /*
520532 * rounds
521533 * 60,62,64,66,68
@@ -532,8 +544,8 @@ _loop0:
532544 UPDATE_HASH 12 (HASH_PTR), D
533545 UPDATE_HASH 16 (HASH_PTR), E
534546
535- cmp K_BASE, BUFFER_PTR /* is current block the last one? */
536- je _loop
547+ test BLOCKS_CTR, BLOCKS_CTR
548+ jz _loop
537549
538550 mov TB, B
539551
@@ -575,10 +587,10 @@ _loop2:
575587 .set j, j+2
576588 .endr
577589
578- add $(2* 64 ), BUFFER_PTR2 /* move to next even-64-byte block */
579-
580- cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */
581- cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
590+ /* update counter */
591+ sub $1 , BLOCKS_CTR
592+ /* Move to the next block only if needed */
593+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4 , 128
582594
583595 jmp _loop3
584596_loop3:
@@ -641,19 +653,12 @@ _loop3:
641653
642654 avx2_zeroupper
643655
644- lea K_XMM_AR(%rip ), K_BASE
645-
656+ /* Setup initial values */
646657 mov CTX, HASH_PTR
647658 mov BUF, BUFFER_PTR
648- lea 64 (BUF), BUFFER_PTR2
649-
650- shl $6 , CNT /* mul by 64 */
651- add BUF, CNT
652- add $64 , CNT
653- mov CNT, BUFFER_END
654659
655- cmp BUFFER_END , BUFFER_PTR2
656- cmovae K_BASE, BUFFER_PTR2
660+ mov BUF , BUFFER_PTR2
661+ mov CNT, BLOCKS_CTR
657662
658663 xmm_mov BSWAP_SHUFB_CTL(%rip ), YMM_SHUFB_BSWAP
659664
0 commit comments