@@ -193,15 +193,16 @@ AES_ENTRY(aes_cbc_encrypt)
 	cbz	w6, .Lcbcencloop

 	ld1	{v0.16b}, [x5]			/* get iv */
-	enc_prepare	w3, x2, x5
+	enc_prepare	w3, x2, x6

 .Lcbcencloop:
 	ld1	{v1.16b}, [x1], #16		/* get next pt block */
 	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
-	encrypt_block	v0, w3, x2, x5, w6
+	encrypt_block	v0, w3, x2, x6, w7
 	st1	{v0.16b}, [x0], #16
 	subs	w4, w4, #1
 	bne	.Lcbcencloop
+	st1	{v0.16b}, [x5]			/* return iv */
 	ret
 AES_ENDPROC(aes_cbc_encrypt)

@@ -211,7 +212,7 @@ AES_ENTRY(aes_cbc_decrypt)
 	cbz	w6, .LcbcdecloopNx

 	ld1	{v7.16b}, [x5]			/* get iv */
-	dec_prepare	w3, x2, x5
+	dec_prepare	w3, x2, x6

 .LcbcdecloopNx:
 #if INTERLEAVE >= 2
@@ -248,14 +249,15 @@ AES_ENTRY(aes_cbc_decrypt)
 .Lcbcdecloop:
 	ld1	{v1.16b}, [x1], #16		/* get next ct block */
 	mov	v0.16b, v1.16b			/* ...and copy to v0 */
-	decrypt_block	v0, w3, x2, x5, w6
+	decrypt_block	v0, w3, x2, x6, w7
 	eor	v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
 	mov	v7.16b, v1.16b			/* ct is next iv */
 	st1	{v0.16b}, [x0], #16
 	subs	w4, w4, #1
 	bne	.Lcbcdecloop
 .Lcbcdecout:
 	FRAME_POP
+	st1	{v7.16b}, [x5]			/* return iv */
 	ret
 AES_ENDPROC(aes_cbc_decrypt)

@@ -267,36 +269,27 @@ AES_ENDPROC(aes_cbc_decrypt)

 AES_ENTRY(aes_ctr_encrypt)
 	FRAME_PUSH
-	cbnz	w6, .Lctrfirst		/* 1st time around? */
-	umov	x5, v4.d[1]		/* keep swabbed ctr in reg */
-	rev	x5, x5
-#if INTERLEAVE >= 2
-	cmn	w5, w4			/* 32 bit overflow? */
-	bcs	.Lctrinc
-	add	x5, x5, #1		/* increment BE ctr */
-	b	.LctrincNx
-#else
-	b	.Lctrinc
-#endif
-.Lctrfirst:
+	cbz	w6, .Lctrnotfirst	/* 1st time around? */
 	enc_prepare	w3, x2, x6
 	ld1	{v4.16b}, [x5]
-	umov	x5, v4.d[1]		/* keep swabbed ctr in reg */
-	rev	x5, x5
+
+.Lctrnotfirst:
+	umov	x8, v4.d[1]		/* keep swabbed ctr in reg */
+	rev	x8, x8
 #if INTERLEAVE >= 2
-	cmn	w5, w4			/* 32 bit overflow? */
+	cmn	w8, w4			/* 32 bit overflow? */
 	bcs	.Lctrloop
 .LctrloopNx:
 	subs	w4, w4, #INTERLEAVE
 	bmi	.Lctr1x
 #if INTERLEAVE == 2
 	mov	v0.8b, v4.8b
 	mov	v1.8b, v4.8b
-	rev	x7, x5
-	add	x5, x5, #1
+	rev	x7, x8
+	add	x8, x8, #1
 	ins	v0.d[1], x7
-	rev	x7, x5
-	add	x5, x5, #1
+	rev	x7, x8
+	add	x8, x8, #1
 	ins	v1.d[1], x7
 	ld1	{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
 	do_encrypt_block2x
@@ -305,7 +298,7 @@ AES_ENTRY(aes_ctr_encrypt)
 	st1	{v0.16b-v1.16b}, [x0], #32
 #else
 	ldr	q8, =0x30000000200000001	/* addends 1,2,3[,0] */
-	dup	v7.4s, w5
+	dup	v7.4s, w8
 	mov	v0.16b, v4.16b
 	add	v7.4s, v7.4s, v8.4s
 	mov	v1.16b, v4.16b
@@ -323,49 +316,52 @@ AES_ENTRY(aes_ctr_encrypt)
 	eor	v2.16b, v7.16b, v2.16b
 	eor	v3.16b, v5.16b, v3.16b
 	st1	{v0.16b-v3.16b}, [x0], #64
-	add	x5, x5, #INTERLEAVE
+	add	x8, x8, #INTERLEAVE
 #endif
-	cbz	w4, .LctroutNx
-.LctrincNx:
-	rev	x7, x5
+	rev	x7, x8
 	ins	v4.d[1], x7
+	cbz	w4, .Lctrout
 	b	.LctrloopNx
-.LctroutNx:
-	sub	x5, x5, #1
-	rev	x7, x5
-	ins	v4.d[1], x7
-	b	.Lctrout
 .Lctr1x:
 	adds	w4, w4, #INTERLEAVE
 	beq	.Lctrout
 #endif
 .Lctrloop:
 	mov	v0.16b, v4.16b
 	encrypt_block	v0, w3, x2, x6, w7
+
+	adds	x8, x8, #1		/* increment BE ctr */
+	rev	x7, x8
+	ins	v4.d[1], x7
+	bcs	.Lctrcarry		/* overflow? */
+
+.Lctrcarrydone:
 	subs	w4, w4, #1
 	bmi	.Lctrhalfblock		/* blocks < 0 means 1/2 block */
 	ld1	{v3.16b}, [x1], #16
 	eor	v3.16b, v0.16b, v3.16b
 	st1	{v3.16b}, [x0], #16
-	beq	.Lctrout
-.Lctrinc:
-	adds	x5, x5, #1		/* increment BE ctr */
-	rev	x7, x5
-	ins	v4.d[1], x7
-	bcc	.Lctrloop		/* no overflow? */
-	umov	x7, v4.d[0]		/* load upper word of ctr */
-	rev	x7, x7			/* ... to handle the carry */
-	add	x7, x7, #1
-	rev	x7, x7
-	ins	v4.d[0], x7
-	b	.Lctrloop
+	bne	.Lctrloop
+
+.Lctrout:
+	st1	{v4.16b}, [x5]		/* return next CTR value */
+	FRAME_POP
+	ret
+
 .Lctrhalfblock:
 	ld1	{v3.8b}, [x1]
 	eor	v3.8b, v0.8b, v3.8b
 	st1	{v3.8b}, [x0]
-.Lctrout:
 	FRAME_POP
 	ret
+
+.Lctrcarry:
+	umov	x7, v4.d[0]		/* load upper word of ctr */
+	rev	x7, x7			/* ... to handle the carry */
+	add	x7, x7, #1
+	rev	x7, x7
+	ins	v4.d[0], x7
+	b	.Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)
 	.ltorg

0 commit comments