igzip: Add avx2 optimized adler32 checksum

Change-Id: I019a38cf98836e3e6c7215a6914b85abb9399e33 Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
2017-05-05 18:17:15 -07:00 · 2017-05-05 18:17:15 -07:00 · 3025e83b91
parent f4a5b303e2
commit 3025e83b91
7 changed files with 315 additions and 9 deletions
--- a/igzip/Makefile.am
+++ b/igzip/Makefile.am
@ -49,6 +49,7 @@ lsrc_x86_64 +=  \
 		igzip/igzip_icf_finish.asm \
 		igzip/rfc1951_lookup.asm \
 		igzip/crc32_gzip.asm igzip/detect_repeated_char.asm \
+		igzip/adler32_avx2_4.asm \
 		igzip/igzip_multibinary.asm \
 		igzip/igzip_update_histogram_01.asm \
 		igzip/igzip_update_histogram_04.asm \
--- a/igzip/adler32_avx2_4.asm
+++ b/igzip/adler32_avx2_4.asm
@ -0,0 +1,292 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; uint32_t adler32_avx2_4(uint32_t init, const unsigned char *buf, uint64_t len)
+
+%define LIMIT 5552
+%define BASE  0xFFF1 ; 65521
+
+%define CHUNKSIZE 16
+%define CHUNKSIZE_M1 (CHUNKSIZE-1)
+
+%include "reg_sizes.asm"
+
+default rel
+[bits 64]
+
+; need to keep free: eax, ecx, edx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg1   rdi
+ %define arg2   rsi
+ %define arg3   rdx
+
+ %define init_d edi
+ %define data   r9
+ %define size   r10
+ %define s      r11
+ %define a_d    r12d
+ %define b_d    r8d
+ %define end    r13
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12	; r12d holds a_d in this configuration
+	push	r13	; r13 holds end in this configuration
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r13	; pops mirror the pushes in FUNC_SAVE, in reverse order
+	pop	r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1   rcx
+ %define arg2   rdx
+ %define arg3   r8
+
+ %define init_d r12d
+ %define data   r9
+ %define size	r10
+ %define s	r11
+ %define a_d	esi
+ %define b_d	edi
+ %define end	r13
+
+ %define stack_size  2*16 + 5*8		; two xmm slots + five 8-byte slots (four used below); must be an odd multiple of 8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	vmovdqa	[rsp + 0*16], xmm6	; xmm6/xmm7 back yshuf0/yshuf1 and are restored in FUNC_RESTORE
+	vmovdqa	[rsp + 1*16], xmm7
+	save_reg	rdi,  2*16 + 0*8
+	save_reg	rsi,  2*16 + 1*8
+	save_reg	r12,  2*16 + 2*8
+	save_reg	r13,  2*16 + 3*8
+	end_prolog
+	mov	init_d, ecx	; initialize init_d from arg1 to keep ecx free
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp + 0*16]
+	vmovdqa	xmm7, [rsp + 1*16]
+	mov	rdi,  [rsp + 2*16 + 0*8]
+	mov	rsi,  [rsp + 2*16 + 1*8]
+	mov	r12,  [rsp + 2*16 + 2*8]
+	mov	r13,  [rsp + 2*16 + 3*8]
+	add	rsp, stack_size	; release the frame allocated by alloc_stack
+ %endmacro
+%endif
+
+%define ya	ymm0
+%define yb	ymm1
+%define ydata0	ymm2
+%define ydata1	ymm3
+%define ysa	ymm4
+%define ydata   ysa	; ydata and ysa are the same register (ymm4)
+%define ytmp0   ydata0	; alias of ymm2
+%define ytmp1   ydata1	; alias of ymm3
+%define ytmp2   ymm5
+%define xa	xmm0
+%define xb      xmm1
+%define xtmp0   xmm2
+%define xtmp1   xmm3
+%define xsa     xmm4
+%define xtmp2   xmm5
+%define yshuf0	ymm6	; loaded from SHUF0 at function entry
+%define yshuf1	ymm7	; loaded from SHUF1 at function entry
+
+
+global adler32_avx2_4:function	; args: (init, buf, len); the checksum is returned in eax
+func(adler32_avx2_4)
+	FUNC_SAVE
+
+	vmovdqa	yshuf0, [SHUF0]	; mask: input bytes 0..7 -> eight zero-extended dwords
+	vmovdqa	yshuf1, [SHUF1]	; mask: input bytes 8..15 -> eight zero-extended dwords
+
+	mov	data, arg2
+	mov	size, arg3
+
+	mov	b_d, init_d
+	shr	b_d, 16		; b = high 16 bits of init
+	and	init_d, 0xFFFF	; a = low 16 bits of init
+	cmp	size, 32
+	jb	.lt64		; fewer than 32 bytes (despite the label name): byte-wise path only
+	vmovd	xa, init_d	; running a goes into dword lane 0 of ya; the other lanes start at zero
+	vpxor	yb, yb, yb
+.sloop1:
+	mov	s, LIMIT
+	cmp	s, size
+	cmova	s, size		; s = min(size, LIMIT)
+	lea	end, [data + s - CHUNKSIZE_M1]	; a full CHUNKSIZE block fits only while data < end
+	cmp	data, end
+	jae	.skip_loop_1a
+align 32
+.sloop1a:
+	; each iteration consumes CHUNKSIZE bytes as two 8-lane steps
+	vbroadcastf128	ydata, [data]	; same 16 input bytes in both 128-bit halves
+	add	data, CHUNKSIZE
+	vpshufb	ydata0, ydata, yshuf0	; bytes 0..7 as dwords
+	vpaddd	ya, ya, ydata0
+	vpaddd	yb, yb, ya	; per-lane b accumulates the per-lane a after every step
+	vpshufb	ydata1, ydata, yshuf1	; bytes 8..15 as dwords
+	vpaddd	ya, ya, ydata1
+	vpaddd	yb, yb, ya
+	cmp	data, end
+	jb	.sloop1a
+
+.skip_loop_1a:
+	add	end, CHUNKSIZE_M1	; end = block start + s, i.e. one past the last byte of this pass
+
+	test	s, CHUNKSIZE_M1
+	jnz	.do_final	; s is not a multiple of CHUNKSIZE: tail bytes remain
+
+	; either we're done, or we just did LIMIT
+	sub	size, s
+
+	; reduce
+	vpslld	yb, 3   ; b is scaled by 8
+	vpmulld	ysa, ya, [A_SCALE] ; scaled a
+
+	; compute horizontal sums of ya, yb, ysa
+	vextracti128 xtmp0, ya, 1
+	vextracti128 xtmp1, yb, 1
+	vextracti128 xtmp2, ysa, 1
+	vpaddd	xa, xa, xtmp0
+	vpaddd	xb, xb, xtmp1
+	vpaddd	xsa, xsa, xtmp2
+	vphaddd	xa, xa, xa
+	vphaddd	xb, xb, xb
+	vphaddd	xsa, xsa, xsa
+	vphaddd	xa, xa, xa
+	vphaddd	xb, xb, xb
+	vphaddd	xsa, xsa, xsa
+
+	vmovd	eax, xa
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
+	mov	a_d, edx	; a = (sum of ya lanes) mod BASE
+
+	vpsubd	xb, xb, xsa	; 8*sum(yb lanes) - sum(lane_index * ya lanes)
+	vmovd	eax, xb
+	add	eax, b_d	; add the b carried in from init or the previous pass
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
+	mov	b_d, edx
+
+	test	size, size
+	jz	.finish
+
+	; continue loop
+	vmovd	xa, a_d		; restart the vector a from the reduced value in lane 0
+	vpxor	yb, yb
+	jmp	.sloop1
+
+.finish:
+	mov	eax, b_d
+	shl	eax, 16
+	or	eax, a_d	; result = (b << 16) | a
+	jmp	.end
+
+.lt64:
+	mov	a_d, init_d
+	lea	end, [data + size]
+	test	size, size
+	jnz	.final_loop
+	jmp	.zero_size
+
+	; handle remaining 1...15 bytes
+.do_final:
+	; fold the vector accumulators into scalar a_d/b_d (same math as the reduce above, without the modulo)
+	vpslld	yb, 3   ; b is scaled by 8
+	vpmulld	ysa, ya, [A_SCALE] ; scaled a
+
+	vextracti128 xtmp0, ya, 1
+	vextracti128 xtmp1, yb, 1
+	vextracti128 xtmp2, ysa, 1
+	vpaddd	xa, xa, xtmp0
+	vpaddd	xb, xb, xtmp1
+	vpaddd	xsa, xsa, xtmp2
+	vphaddd	xa, xa, xa
+	vphaddd	xb, xb, xb
+	vphaddd	xsa, xsa, xsa
+	vphaddd	xa, xa, xa
+	vphaddd	xb, xb, xb
+	vphaddd	xsa, xsa, xsa
+	vpsubd	xb, xb, xsa
+
+	vmovd	a_d, xa		; not yet reduced; the modulo happens at .zero_size
+	vmovd	eax, xb
+	add	b_d, eax
+
+align 32
+.final_loop:
+	movzx	eax, byte[data]
+	add	a_d, eax	; a += next byte
+	inc	data
+	add	b_d, a_d	; b += a
+	cmp	data, end
+	jb	.final_loop
+
+.zero_size:
+	mov	eax, a_d
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
+	mov	a_d, edx
+
+	mov	eax, b_d
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
+	shl	edx, 16
+	or	edx, a_d
+	mov	eax, edx	; result = ((b mod BASE) << 16) | (a mod BASE)
+
+.end:
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+align 32
+A_SCALE:	; dword lanes 0..7 hold the weights 0..7 that vpmulld applies to ya
+	dq	0x0000000100000000, 0x0000000300000002
+	dq	0x0000000500000004, 0x0000000700000006
+SHUF0:		; vpshufb mask: each dword takes one source byte (indices 0..7); 0xFF bytes select zero
+	dq	0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02
+	dq	0xFFFFFF05FFFFFF04, 0xFFFFFF07FFFFFF06
+SHUF1:		; same layout as SHUF0 for source byte indices 8..15
+	dq	0xFFFFFF09FFFFFF08, 0xFFFFFF0BFFFFFF0A
+	dq	0xFFFFFF0DFFFFFF0C, 0xFFFFFF0FFFFFFF0E
+
--- a/igzip/crc32_gzip_base.c
+++ b/igzip/crc32_gzip_base.c
@ -114,10 +114,6 @@ uint32_t adler32_base(uint32_t adler32, uint8_t * start, uint32_t length)
 	A = adler32 & 0xffff;
 	B = adler32 >> 16;

-	/* Internally the checksum is being stored as B | (A-1) so crc and
-	 * addler have same init value */
-	A += 1;
-
 	while (length > MAX_ADLER_BUF) {
 		end = next + MAX_ADLER_BUF;
 		for (; next < end; next++) {
@ -136,8 +132,6 @@ uint32_t adler32_base(uint32_t adler32, uint8_t * start, uint32_t length)
 		B += A;
 	}

-	A -= 1;
-
 	A = A % ADLER_MOD;
 	B = B % ADLER_MOD;

--- a/igzip/igzip.c
+++ b/igzip/igzip.c
@ -127,6 +127,23 @@ struct slver isal_deflate_set_hufftables_slver = { 0x008b, 0x01, 0x00 };

 /*****************************************************************/

+// isal_adler32_bam1 - adler with (B | A minus 1) storage
+
+uint32_t isal_adler32_bam1(uint32_t adler32, const unsigned char *start, uint64_t length)
+{
+	uint64_t a;
+
+	/* Internally the checksum is stored as B | (A-1) so that crc and
+	 * adler have the same init value */
+	a = adler32 & 0xffff;	/* stored low half, i.e. A-1 */
+	a = (a == ADLER_MOD - 1) ? 0 : a + 1;	/* back to A, wrapping ADLER_MOD-1 to 0 */
+	adler32 = isal_adler32((adler32 & 0xffff0000) | a, start, length);
+	a = (adler32 & 0xffff);	/* low half returned by isal_adler32 */
+	a = (a == 0) ? ADLER_MOD - 1 : a - 1;	/* back to the A-1 form, wrapping 0 to ADLER_MOD-1 */
+
+	return (adler32 & 0xffff0000) | a;	/* high half unchanged, low half in A-1 form */
+}
+
 static void update_checksum(struct isal_zstream *stream, uint8_t * start_in, uint64_t length)
 {
 	struct isal_zstate *state = &stream->internal_state;
@ -137,7 +154,7 @@ static void update_checksum(struct isal_zstream *stream, uint8_t * start_in, uin
 		break;
 	case IGZIP_ZLIB:
 	case IGZIP_ZLIB_NO_HDR:
-		state->crc = isal_adler32(state->crc, start_in, length);
+		state->crc = isal_adler32_bam1(state->crc, start_in, length);
 		break;
 	}
 }
--- a/igzip/igzip_checksums.h
+++ b/igzip/igzip_checksums.h
@ -8,5 +8,6 @@

 uint32_t crc32_gzip(uint32_t init_crc, const unsigned char *buf, uint64_t len);
 uint32_t isal_adler32(uint32_t init_crc, const unsigned char *buf, uint64_t len);
+uint32_t isal_adler32_bam1(uint32_t init_crc, const unsigned char *buf, uint64_t len);

 #endif
--- a/igzip/igzip_inflate.c
+++ b/igzip/igzip_inflate.c
@ -107,7 +107,7 @@ static void update_checksum(struct inflate_state *state, uint8_t * start_in, uin
 		break;
 	case ISAL_ZLIB:
 	case ISAL_ZLIB_NO_HDR:
-		state->crc = isal_adler32(state->crc, start_in, length);
+		state->crc = isal_adler32_bam1(state->crc, start_in, length);
 		break;
 	}
 }
--- a/igzip/igzip_multibinary.asm
+++ b/igzip/igzip_multibinary.asm
@ -68,6 +68,7 @@ extern crc32_gzip_base
 extern crc32_gzip_01

 extern adler32_base
+extern adler32_avx2_4

 section .text

@ -98,4 +99,4 @@ mbin_interface		crc32_gzip
 mbin_dispatch_init5	crc32_gzip, crc32_gzip_base, crc32_gzip_base, crc32_gzip_01, crc32_gzip_01

 mbin_interface		isal_adler32
-mbin_dispatch_init5	isal_adler32, adler32_base, adler32_base, adler32_base, adler32_base
+mbin_dispatch_init5	isal_adler32, adler32_base, adler32_base, adler32_base, adler32_avx2_4