Add raid unit

New raid unit adds source for optimized xor and P+Q functions. Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
2016-04-26 15:55:12 -07:00 · 2016-04-26 15:55:12 -07:00 · d6c5e9620d
parent fce681adb4
commit d6c5e9620d
24 changed files with 4263 additions and 5 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -25,6 +25,7 @@ perf_tests32=
 # Include units

 include erasure_code/Makefile.am
+include raid/Makefile.am

 # LIB version info not necessarily the same as package version
 LIBISAL_CURRENT=2
--- a/Makefile.nmake
+++ b/Makefile.nmake
@ -27,9 +27,10 @@
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ########################################################################

-objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_avx512.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_avx512.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_avx512.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_avx512.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_avx512.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_avx512.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_avx512.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_avx512.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj

-INCLUDES  = -I./ -Ierasure_code/ -Iinclude/
+objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_avx512.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_avx512.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_avx512.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_avx512.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_avx512.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_avx512.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_avx512.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_avx512.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj bin\pq_check_sse.obj bin\pq_gen_avx.obj bin\pq_gen_avx2.obj bin\pq_gen_sse.obj bin\raid_base.obj bin\raid_multibinary.obj bin\xor_check_sse.obj bin\xor_gen_avx.obj bin\xor_gen_sse.obj
+
+INCLUDES  = -I./ -Ierasure_code/ -Iraid/ -Iinclude/
 LINKFLAGS = /nologo
 CFLAGS   = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) $(D)
 AFLAGS   = -f win64 $(INCLUDES) $(D)
@ -53,13 +54,24 @@ isa-l.dll: $(objs)
 {erasure_code}.asm.obj:
 	$(AS) $(AFLAGS) -o $@ $?

+{raid}.c.obj:
+	$(CC) $(CFLAGS) /c -Fo$@ $?
+{raid}.asm.obj:
+	$(AS) $(AFLAGS) -o $@ $?


+# Examples
+ex = xor_example.exe
+ex: lib $(ex)
+
+$(ex): $(@B).obj
+
 .obj.exe:
 	link /out:$@ $(LINKFLAGS) isa-l.lib $?

 # Check tests
-checks = erasure_code_test.exe erasure_code_update_test.exe gf_inverse_test.exe gf_vect_mul_test.exe
+checks = erasure_code_test.exe erasure_code_update_test.exe gf_inverse_test.exe gf_vect_mul_test.exe \
+	pq_check_test.exe pq_gen_test.exe xor_check_test.exe xor_gen_test.exe

 checks: lib $(checks)
 $(checks): $(@B).obj
@ -73,7 +85,7 @@ tests: lib $(tests)
 $(tests): $(@B).obj

 # Performance tests
-perfs = erasure_code_base_perf.exe erasure_code_perf.exe erasure_code_sse_perf.exe erasure_code_update_perf.exe gf_2vect_dot_prod_sse_perf.exe gf_3vect_dot_prod_sse_perf.exe gf_4vect_dot_prod_sse_perf.exe gf_5vect_dot_prod_sse_perf.exe gf_6vect_dot_prod_sse_perf.exe gf_vect_dot_prod_1tbl.exe gf_vect_dot_prod_avx_perf.exe gf_vect_dot_prod_perf.exe gf_vect_dot_prod_sse_perf.exe gf_vect_mad_perf.exe gf_vect_mul_avx_perf.exe gf_vect_mul_perf.exe gf_vect_mul_sse_perf.exe
+perfs = erasure_code_base_perf.exe erasure_code_perf.exe erasure_code_sse_perf.exe erasure_code_update_perf.exe gf_2vect_dot_prod_sse_perf.exe gf_3vect_dot_prod_sse_perf.exe gf_4vect_dot_prod_sse_perf.exe gf_5vect_dot_prod_sse_perf.exe gf_6vect_dot_prod_sse_perf.exe gf_vect_dot_prod_1tbl.exe gf_vect_dot_prod_avx_perf.exe gf_vect_dot_prod_perf.exe gf_vect_dot_prod_sse_perf.exe gf_vect_mad_perf.exe gf_vect_mul_avx_perf.exe gf_vect_mul_perf.exe gf_vect_mul_sse_perf.exe pq_gen_perf.exe xor_gen_perf.exe

 perfs: lib $(perfs)
 $(perfs): $(@B).obj
--- a/Makefile.unx
+++ b/Makefile.unx
@ -27,7 +27,7 @@
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ########################################################################

-units     = erasure_code
+units     = erasure_code raid

 default: lib

--- a/include/raid.h
+++ b/include/raid.h
@ -0,0 +1,302 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _RAID_H_
+#define _RAID_H_
+
+/**
+ *  @file  raid.h
+ *  @brief Interface to RAID functions - XOR and P+Q calculation.
+ *
+ *  This file defines the interface to optimized XOR calculation (RAID5) or P+Q
+ *  dual parity (RAID6).  Operations are carried out on an array of pointers to
+ *  sources and output arrays.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Multi-binary functions */
+
+/**
+ * @brief Generate XOR parity vector from N sources, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to source and dest. For XOR the dest is
+ *                the last pointer. ie array[vects-1]. Src and dest
+ *                pointers must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array has XOR parity sum of 0 across all vectors, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects   Number of vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to vectors. Src and dest pointers
+ *                must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_check(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 32B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ *                is the last two pointers. ie array[vects-2],
+ *                array[vects-1].  P and Q parity vectors are
+ *                written to these last two pointers. Src and dest
+ *                pointers must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects  Number of vectors in array including P&Q.
+ * @param len    Length of each vector in bytes. Must be 16B aligned.
+ * @param array  Array of pointers to source and P, Q. P and Q parity
+ *               are assumed to be the last two pointers in the array.
+ *               All pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_check(int vects, int len, void **array);
+
+
+/* Arch specific versions */
+
+/**
+ * @brief Generate XOR parity vector from N sources.
+ * @requires SSE4.1
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to source and dest. For XOR the dest is
+ *                the last pointer. ie array[vects-1]. Src and dest pointers
+ *                must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate XOR parity vector from N sources.
+ * @requires AVX
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to source and dest. For XOR the dest is
+ *                the last pointer. ie array[vects-1]. Src and dest pointers
+ *                must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen_avx(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array has XOR parity sum of 0 across all vectors.
+ * @requires SSE4.1
+ *
+ * @param vects   Number of vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to vectors. Src and dest pointers
+ *                must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_check_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources.
+ * @requires SSE4.1
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 16B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ *                is the last two pointers. ie array[vects-2],
+ *                array[vects-1]. P and Q parity vectors are
+ *                written to these last two pointers. Src and dest
+ *                pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources.
+ * @requires AVX
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 16B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ *                is the last two pointers. ie array[vects-2],
+ *                array[vects-1]. P and Q parity vectors are
+ *                written to these last two pointers. Src and dest
+ *                pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_avx(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources.
+ * @requires AVX2
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 32B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ *                is the last two pointers. ie array[vects-2],
+ *                array[vects-1]. P and Q parity vectors are
+ *                written to these last two pointers. Src and dest
+ *                pointers must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_avx2(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array of N sources, P and Q are consistent across all vectors.
+ * @requires SSE4.1
+ *
+ * @param vects  Number of vectors in array including P&Q.
+ * @param len    Length of each vector in bytes. Must be 16B aligned.
+ * @param array  Array of pointers to source and P, Q. P and Q parity
+ *               are assumed to be the last two pointers in the array.
+ *               All pointers must be aligned to 16B.
+ * @returns 0 pass, other fail
+ */
+
+int pq_check_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources, runs baseline version.
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 16B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ * 		  is the last two pointers. ie array[vects-2],
+ * 		  array[vects-1]. P and Q parity vectors are
+ * 		  written to these last two pointers. Src and dest pointers
+ * 		  must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_base(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate XOR parity vector from N sources, runs baseline version.
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to source and dest. For XOR the dest is
+ * 		  the last pointer. ie array[vects-1]. Src and dest pointers
+ * 		  must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen_base(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array has XOR parity sum of 0 across all vectors, runs baseline version.
+ *
+ * @param vects   Number of vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to vectors. Src and dest pointers
+ *                must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_check_base(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs baseline version.
+ *
+ * @param vects  Number of vectors in array including P&Q.
+ * @param len    Length of each vector in bytes. Must be 16B aligned.
+ * @param array  Array of pointers to source and P, Q. P and Q parity
+ *               are assumed to be the last two pointers in the array.
+ *               All pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_check_base(int vects, int len, void **array);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_RAID_H_
--- a/isa-l.def
+++ b/isa-l.def
@ -54,3 +54,18 @@ gf_vect_mul             @50
 ec_encode_data_update   @51
 gf_vect_dot_prod        @52
 gf_vect_mad             @53
+xor_gen                 @54
+xor_check               @55
+pq_gen                  @56
+pq_check                @57
+xor_gen_sse             @58
+xor_gen_avx             @59
+xor_check_sse           @60
+pq_gen_sse              @61
+pq_gen_avx              @62
+pq_gen_avx2             @63
+pq_check_sse            @64
+pq_gen_base             @65
+xor_gen_base            @66
+xor_check_base          @67
+pq_check_base           @68
--- a/raid/Makefile.am
+++ b/raid/Makefile.am
@ -0,0 +1,45 @@
+########################################################################
+#  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions 
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Intel Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc        += 	raid/xor_gen_sse.asm raid/pq_gen_sse.asm raid/xor_check_sse.asm \
+		raid/pq_check_sse.asm raid/pq_gen_avx.asm \
+		raid/xor_gen_avx.asm raid/pq_gen_avx2.asm \
+		raid/raid_base.c raid/raid_multibinary.asm
+
+extern_hdrs +=	include/raid.h
+
+other_src   +=  include/test.h include/types.h
+
+check_tests +=  raid/xor_gen_test raid/pq_gen_test raid/xor_check_test raid/pq_check_test
+
+perf_tests  +=  raid/xor_gen_perf raid/pq_gen_perf
+
+examples    += 	raid/xor_example
+
+lsrc32       += xor_gen_sse.asm pq_gen_sse_i32.asm xor_check_sse.asm pq_check_sse_i32.asm raid_base.c
--- a/raid/pq_check_sse.asm
+++ b/raid/pq_check_sse.asm
@ -0,0 +1,277 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized P+Q parity check of N source vectors using SSE (ptest needs SSE4.1)
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Checks the P+Q parity of N (vects-2) sources in an array of pointers
+;;; (**array).  Last two pointers are the P and Q parity vectors respectively.
+;;; Vectors must be aligned to 16 bytes.  Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp3  arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmp3  r10
+ %define return rax
+ %define stack_size  7*16 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size	; helper macro not defined in this file (presumably from reg_sizes.asm)
+	save_xmm128	xmm6, 0*16	; xmm6-xmm11 and xmm15 are the xmm registers >= 6 used below
+	save_xmm128	xmm7, 1*16
+	save_xmm128	xmm8, 2*16
+	save_xmm128	xmm9, 3*16
+	save_xmm128	xmm10, 4*16
+	save_xmm128	xmm11, 5*16
+	save_xmm128	xmm15, 6*16	; xpoly; FUNC_RESTORE must reload each register from the same offset
+	end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]	; reload each register from the slot FUNC_SAVE stored it in
+	movdqa	xmm7, [rsp + 1*16]
+	movdqa	xmm8, [rsp + 2*16]
+	movdqa	xmm9, [rsp + 3*16]
+	movdqa	xmm10, [rsp + 4*16]
+	movdqa	xmm11, [rsp + 5*16]
+	movdqa	xmm15, [rsp + 6*16]	; slot 6 (not 9): 9*16 lies beyond the 7*16+8 byte frame
+	add	rsp, stack_size		; release the frame allocated in FUNC_SAVE
+ %endmacro
+%endif
+
+%define vec arg0
+%define	len arg1
+%define ptr arg3
+%define pos return
+
+%define xp1   xmm0
+%define xq1   xmm1
+%define xtmp1 xmm2
+%define xs1   xmm3
+
+%define xp2   xmm4
+%define xq2   xmm5
+%define xtmp2 xmm6
+%define xs2   xmm7
+
+%define xp3   xmm8
+%define xq3   xmm9
+%define xtmp3 xmm10
+%define xs3   xmm11
+
+%define xpoly xmm15
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movdqa
+ %define XSTR movntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_check_sse:function
+func(pq_check_sse)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources (plus P and Q)
+	cmp	len, 0
+	je	return_pass		;Zero length: nothing to check
+	test	len, (16-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0			;pos = byte offset into every vector
+	movdqa	xpoly, [poly]		;0x1d in every byte lane
+	cmp	len, 48
+	jl	loop16			;Less than 48 bytes: 16B loop only
+
+len_aligned_32bytes:
+	sub	len, 48			;Bias len so loop48 can run while pos <= len
+
+loop48:
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector (tmp is overwritten below before use)
+	XLDR	xp1, [ptr+pos]		;Initialize xp1 with P1 src
+	XLDR	xp2, [ptr+pos+16]	;Initialize xp2 with P2 src + 16B ahead
+	XLDR	xp3, [ptr+pos+32]	;Initialize xp3 with P3 src + 32B ahead
+	pxor	xq1, xq1		;q1 = 0
+	pxor	xq2, xq2		;q2 = 0
+	pxor	xq3, xq3		;q3 = 0
+
+	mov 	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+16]	;Preload last vector (source)
+	XLDR	xs3, [ptr+pos+32]	;Preload last vector (source)
+
+next_vect:
+	sub	tmp, 1		  	;Inner loop for each source vector; sets flags for jg below
+	mov 	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	pxor	xp1, xs1		; p1 ^= s1
+	pxor	xp2, xs2		; p2 ^= s2
+	pxor	xp3, xs3		; p3 ^= s3
+	pxor	xq1, xs1		; q1 ^= s1
+	pxor	xq2, xs2		; q2 ^= s2
+	pxor	xq3, xs3		; q3 ^= s3
+	pxor	xtmp1, xtmp1		; xtmp1 = 0 - for compare to 0
+	pxor	xtmp2, xtmp2		; xtmp2 = 0
+	pxor	xtmp3, xtmp3		; xtmp3 = 0
+	pcmpgtb	xtmp1, xq1		; xtmp1 = mask 0xff or 0x00 if bit7 set
+	pcmpgtb	xtmp2, xq2		; xtmp2 = mask 0xff or 0x00 if bit7 set
+	pcmpgtb	xtmp3, xq3		; xtmp3 = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp1 = poly or 0x00
+	pand	xtmp2, xpoly		; xtmp2 = poly or 0x00
+	pand	xtmp3, xpoly		; xtmp3 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+16]	; Get next vector (source data2)
+	XLDR	xs3, [ptr+pos+32]	; Get next vector (source data3)
+	paddb	xq1, xq1		; q1 = q1<<1
+	paddb	xq2, xq2		; q2 = q2<<1
+	paddb	xq3, xq3		; q3 = q3<<1
+	pxor	xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	pxor	xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	pxor	xq3, xtmp3		; q3 = q3<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0 (flags from sub tmp, 1)
+
+	pxor	xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	pxor	xq1, xs1		;q1 ^= 1 * s1[0]
+	pxor	xp2, xs2		;p2 ^= s2[0]
+	pxor	xq2, xs2		;q2 ^= 1 * s2[0]
+	pxor	xp3, xs3		;p3 ^= s3[0]
+	pxor	xq3, xs3		;q3 ^= 1 * s3[0]
+
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	XLDR	xtmp1, [tmp+pos]	;load saved Q1 into xtmp1
+	XLDR	xtmp2, [tmp+pos+16]	;load saved Q2 (16B ahead) into xtmp2
+	XLDR	xtmp3, [tmp+pos+32]	;load saved Q3 (32B ahead) into xtmp3
+
+	pxor	xq1, xtmp1		;xq1 = q1 calculated ^ q1 saved
+	pxor	xq2, xtmp2		;xq2 = q2 calculated ^ q2 saved
+	pxor	xq3, xtmp3		;xq3 = q3 calculated ^ q3 saved
+
+	por	xp1, xq1		;Confirm that all P&Q parity are 0
+	por	xp1, xp2
+	por	xp1, xq2
+	por	xp1, xp3
+	por	xp1, xq3
+	ptest	xp1, xp1		;ZF set only if every accumulated bit is 0
+	jnz	return_fail
+	add	pos, 48
+	cmp	pos, len
+	jle	loop48			;len was biased by -48 above
+
+
+	;; ------------------------------
+	;; Do last 16 or 32 Bytes remaining
+	add	len, 48			;Undo the -48 bias
+	cmp	pos, len
+	je	return_pass		;No remainder
+
+loop16:
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector (tmp is overwritten below before use)
+	XLDR	xp1, [ptr+pos]		;Initialize xp1 with P1 src
+	pxor	xq1, xq1		;q = 0
+	mov 	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+
+next_vect16:
+	sub	tmp, 1		  	;Inner loop for each source vector; sets flags for jg below
+	mov 	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	pxor	xq1, xs1		; q ^= s
+	pxor	xtmp1, xtmp1		; xtmp = 0
+	pcmpgtb	xtmp1, xq1		; xtmp = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp = poly or 0x00
+	pxor	xp1, xs1		; p ^= s
+	paddb	xq1, xq1		; q = q<<1
+	pxor	xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect16		; Loop for each vect except 0 (flags from sub tmp, 1)
+
+	pxor	xp1, xs1		;p ^= s[0] - last source is already loaded
+	pxor	xq1, xs1		;q ^= 1 * s[0]
+
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	XLDR	xtmp1, [tmp+pos]	;load saved Q1 into xtmp1
+	pxor	xq1, xtmp1		;xq1 = q1 calculated ^ q1 saved
+
+	por	xp1, xq1		;Confirm that all P&Q parity are = 0
+	ptest	xp1, xp1		;ZF set only if every accumulated bit is 0
+	jnz	return_fail
+	add	pos, 16
+	cmp	pos, len
+	jl	loop16
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;;       func          core, ver, snum
+slversion pq_check_sse, 00,   06,  0033
--- a/raid/pq_check_sse_i32.asm
+++ b/raid/pq_check_sse_i32.asm
@ -0,0 +1,282 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq check of N source vectors using SSE (ptest and the default
+;;; non-temporal movntdqa loads require SSE4.1)
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Checks the P+Q parity vectors against N (vects-2) sources in array of
+;;; pointers (**array).  Last two pointers are the P and Q parity vectors
+;;; respectively.  Returns 0 if both match, 1 on mismatch or bad arguments.
+;;; Vectors must be aligned to 16 bytes.  Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+;;; Per-output-format ABI glue.  Each branch maps the C arguments
+;;; (vects, len, array) onto arg0..arg2, picks scratch registers, sets the
+;;; pointer size PS, and defines FUNC_SAVE/FUNC_RESTORE as the matching
+;;; prologue/epilogue.
+%ifidn __OUTPUT_FORMAT__, elf64
+ ;; Only argument/scratch GPRs and xmm registers are used here, so no
+ ;; prologue or epilogue work is emitted for this format.
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ ;; Windows x64 treats xmm6-xmm15 as callee-saved.  The code below writes
+ ;; xmm6 (xtmp2), xmm7 (xs2) and, in 64-bit builds, xmm15 (xpoly), so all
+ ;; three must be spilled in the prologue and reloaded before returning.
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define return rax
+ %define PS 8
+ %define tmp   r11
+ %define stack_size  3*16 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	save_xmm128	xmm15, 2*16
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	movdqa	xmm15, [rsp + 2*16]
+	add	rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ ;; 32-bit: arguments are read from the stack frame via arg(x) into
+ ;; registers.  esi, edi and ebx are pushed in FUNC_SAVE and popped in
+ ;; FUNC_RESTORE in reverse order.
+ %define arg0   edx
+ %define arg1   ecx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2  edi	; must save/restore
+ %define arg3  esi
+ %define tmp   ebx
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg0, arg(0)
+	mov	arg1, arg(1)
+	mov	arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	mov	esp, ebp	;if has frame pointer?
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
+;;; Symbolic names for the general purpose registers used by the function.
+%define vec arg0
+%define	len arg1
+%define ptr arg3
+%define pos return
+
+;;; xmm working set, lane 1: P accumulator, Q accumulator, scratch, source.
+%define xp1   xmm0
+%define xq1   xmm1
+%define xtmp1 xmm2
+%define xs1   xmm3
+
+;;; xmm working set, lane 2 (the block 16 bytes further on).
+%define xp2   xmm4
+%define xq2   xmm5
+%define xtmp2 xmm6
+%define xs2   xmm7
+
+;;; xpoly is a register in 64-bit builds and a memory operand in 32-bit
+;;; builds.
+%ifidn PS,8			; 64-bit code
+ default rel
+ [bits 64]
+ %define xpoly xmm15
+%elifidn PS,4			; 32-bit code
+ %define xpoly [poly]
+%endif
+
+;;; Use Non-temporal load/stor
+;;; (define NO_NT_LDST to use plain aligned moves instead)
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+;;; pq_check_sse(vects, len, array)
+;;;   Recomputes P (xor of all sources) and Q (sources folded high-to-low,
+;;;   doubling each byte and xoring in poly where bit 7 was set) and compares
+;;;   them with the stored P and Q vectors, 32 bytes per iteration while
+;;;   possible and then 16 bytes at a time.
+;;;   Returns 0 when every block matches (or len == 0), 1 when a block
+;;;   differs, vects <= 3, or len is not a multiple of 16.
+;;;   Both inner loops rely on the flags set by "sub tmp, 1" still being
+;;;   intact at the closing "jg"; only mov and SSE instructions sit in
+;;;   between.
+align 16
+global pq_check_sse:function
+func(pq_check_sse)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (16-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0
+%ifidn PS,8
+	movdqa	xpoly, [poly]		;For 64-bit, load poly into high xmm reg
+%endif
+	cmp	len, 32
+	jl	loop16
+
+len_aligned_32bytes:
+	sub	len, 32			;len = offset of the last full 32B block
+
+loop32:
+	mov 	ptr, [arg2+PS+vec*PS] 	;Get address of P parity vector
+	XLDR	xp1, [ptr+pos]		;Initialize xp1 with P1 src
+	XLDR	xp2, [ptr+pos+16]	;Initialize xp2 with P2 src + 16B ahead
+	pxor	xq1, xq1		;q1 = 0
+	pxor	xq2, xq2		;q2 = 0
+
+	mov 	ptr, [arg2+vec*PS] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+16]	;Preload last vector (source)
+
+next_vect:
+	sub	tmp, 1		  	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*PS] 	; get pointer to next vect
+	pxor	xp1, xs1		; p1 ^= s1
+	pxor	xp2, xs2		; p2 ^= s2
+	pxor	xq1, xs1		; q1 ^= s1
+	pxor	xq2, xs2		; q2 ^= s2
+	pxor	xtmp1, xtmp1		; xtmp1 = 0 - for compare to 0
+	pxor	xtmp2, xtmp2		; xtmp2 = 0
+	pcmpgtb	xtmp1, xq1		; xtmp1 = mask 0xff or 0x00 if bit7 set
+	pcmpgtb	xtmp2, xq2		; xtmp2 = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp1 = poly or 0x00
+	pand	xtmp2, xpoly		; xtmp2 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+16]	; Get next vector (source data2)
+	paddb	xq1, xq1		; q1 = q1<<1
+	paddb	xq2, xq2		; q2 = q2<<1
+	pxor	xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	pxor	xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0 (flags from sub)
+
+	pxor	xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	pxor	xq1, xs1		;q1 ^= 1 * s1[0]
+	pxor	xp2, xs2		;p2 ^= s2[0]
+	pxor	xq2, xs2		;q2 ^= 1 * s2[0]
+
+	mov	tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+	XLDR	xtmp1, [tmp+pos]	;xtmp1 = stored Q1
+	XLDR	xtmp2, [tmp+pos+16]	;xtmp2 = stored Q2 (16B ahead)
+
+	pxor	xq1, xtmp1		;xq1 = q1 calculated ^ q1 saved
+	pxor	xq2, xtmp2		;xq2 = q2 calculated ^ q2 saved
+
+	por	xp1, xq1		;Confirm that all P&Q parity are 0
+	por	xp1, xp2
+	por	xp1, xq2
+	ptest	xp1, xp1
+	jnz	return_fail
+	add	pos, 32
+	cmp	pos, len
+	jle	loop32
+
+
+	;; ------------------------------
+	;; Do last 16 Bytes remaining
+	add	len, 32			;Restore the real length
+	cmp	pos, len
+	je	return_pass
+
+loop16:
+	mov	ptr, [arg2+PS+vec*PS]	;Get address of P parity vector
+	XLDR	xp1, [ptr+pos]		;Initialize xp1 with P1 src
+	pxor	xq1, xq1		;q = 0
+	mov 	ptr, [arg2+vec*PS] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+
+next_vect16:
+	sub	tmp, 1		  	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*PS] 	; get pointer to next vect
+	pxor	xq1, xs1		; q ^= s
+	pxor	xtmp1, xtmp1		; xtmp = 0
+	pcmpgtb	xtmp1, xq1		; xtmp = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp = poly or 0x00
+	pxor	xp1, xs1		; p ^= s
+	paddb	xq1, xq1		; q = q<<1
+	pxor	xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect16		; Loop for each vect except 0 (flags from sub)
+
+	pxor	xp1, xs1		;p ^= s[0] - last source is already loaded
+	pxor	xq1, xs1		;q ^= 1 * s[0]
+
+	mov	tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+	XLDR	xtmp1, [tmp+pos]	;xtmp1 = stored Q
+	pxor	xq1, xtmp1		;xq1 = q1 calculated ^ q1 saved
+
+	por	xp1, xq1		;Confirm that all P&Q parity are = 0
+	ptest	xp1, xp1
+	jnz	return_fail
+	add	pos, 16
+	cmp	pos, len
+	jl	loop16
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;; 16 bytes of 0x1d: the per-byte value xored into Q after doubling,
+;;; for bytes whose top bit was set (see the pand/pxor with xpoly).
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;;       func          core, ver, snum
+slversion pq_check_sse, 00,   06,  0033
--- a/raid/pq_check_test.c
+++ b/raid/pq_check_test.c
@ -0,0 +1,304 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN     1024
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/*
+ * Byte-at-a-time reference P+Q generator.
+ *
+ * array holds vects pointers: the first vects-2 are sources, array[vects-2]
+ * receives P and array[vects-1] receives Q.  For every byte position P is
+ * the xor of all sources; Q is built by walking the sources from the last
+ * one down to source 0, doubling the running value (xor 0x1d when bit 7 was
+ * set before the shift) and xoring in the next source.  Always returns 0.
+ */
+int ref_multi_pq(int vects, int len, void **array)
+{
+	unsigned char **vec = (unsigned char **)array;
+	const int last_src = vects - 3;
+	int pos, v;
+
+	for (pos = 0; pos < len; pos++) {
+		unsigned char par = vec[last_src][pos];
+		unsigned char gq = par;
+
+		v = last_src;
+		while (--v >= 0) {
+			unsigned char d = vec[v][pos];
+			unsigned char twice = (unsigned char)(gq << 1);
+
+			if (gq & 0x80)
+				twice ^= 0x1d;	// reduce after the doubling overflowed
+			par ^= d;
+			gq = d ^ twice;
+		}
+
+		vec[vects - 2][pos] = par;	// second to last pointer is p
+		vec[vects - 1][pos] = gq;	// last pointer is q
+	}
+	return 0;
+}
+
+// Fills buf[0..buffer_size) front to back with successive rand() values,
+// one call per byte.
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	long remaining = buffer_size;
+
+	while (remaining-- > 0)
+		*buf++ = rand();
+}
+
+/*
+ * pq_check() unit test.  Builds TEST_SOURCES data buffers plus P and Q,
+ * generates reference parity and confirms that pq_check() returns 0 on
+ * consistent data and non-zero after a single byte is corrupted.  Covers
+ * all-zero data, random data, varying source counts, varying lengths and
+ * windows that end at the end of the allocation.
+ * Returns 0 on success; non-zero on the first hard failure or the
+ * accumulated failure count.
+ */
+int main(int argc, char *argv[])
+{
+	int i, j, k, ret, fail = 0;
+	void *buffs[TEST_SOURCES + 2];
+	char c;
+	char *tmp_buf[TEST_SOURCES + 2];
+	int serr, lerr;
+
+	printf("Test pq_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+	srand(TEST_SEED);
+
+	// Allocate the arrays
+	for (i = 0; i < TEST_SOURCES + 2; i++) {
+		void *buf;
+		if (posix_memalign(&buf, 16, TEST_LEN)) {
+			printf("alloc error: Fail");
+			return 1;
+		}
+		buffs[i] = buf;
+	}
+
+	// Test of all zeros
+	for (i = 0; i < TEST_SOURCES + 2; i++)
+		memset(buffs[i], 0, TEST_LEN);
+
+	ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs);
+	ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+	if (ret != 0) {
+		fail++;
+		printf("\nfail zero test %d\n", ret);
+	}
+
+	((char *)(buffs[0]))[TEST_LEN - 2] = 0x7;	// corrupt buffer
+	ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+	if (ret == 0) {
+		fail++;
+		printf("\nfail corrupt buffer test %d\n", ret);
+	}
+	((char *)(buffs[0]))[TEST_LEN - 2] = 0;	// un-corrupt buffer
+
+	// Test corrupted buffer any location on all sources
+	for (j = 0; j < TEST_SOURCES + 2; j++) {
+		for (i = TEST_LEN - 1; i >= 0; i--) {
+			((char *)buffs[j])[i] = 0x5;	// corrupt buffer
+			ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+			if (ret == 0) {
+				fail++;
+				printf("\nfail corrupt zero buffer test j=%d, i=%d\n", j, i);
+				return 1;
+			}
+			((char *)buffs[j])[i] = 0;	// un-corrupt buffer
+		}
+		putchar('.');
+	}
+
+	// Test rand1
+	for (i = 0; i < TEST_SOURCES + 2; i++)
+		rand_buffer(buffs[i], TEST_LEN);
+
+	ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs);
+	ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+	if (ret != 0) {
+		fail++;
+		printf("fail first rand test %d\n", ret);
+	}
+
+	c = ((char *)(buffs[0]))[TEST_LEN - 2];
+	((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1;
+	ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+	if (ret == 0) {
+		fail++;
+		printf("\nFail corrupt buffer test, passed when should have failed\n");
+	}
+	((char *)(buffs[0]))[TEST_LEN - 2] = c;	// un-corrupt buffer
+
+	// Test corrupted buffer any location on all sources w/ random data
+	for (j = 0; j < TEST_SOURCES + 2; j++) {
+		for (i = TEST_LEN - 1; i >= 0; i--) {
+			// Check it still passes
+			ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+			if (ret != 0) {	// should pass
+				fail++;
+				printf
+				    ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n",
+				     j, i);
+				return 1;
+			}
+			c = ((char *)buffs[j])[i];
+			((char *)buffs[j])[i] = c ^ 1;	// corrupt buffer
+			ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+			if (ret == 0) {	// Check it now fails
+				fail++;
+				printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+				return 1;
+			}
+			((char *)buffs[j])[i] = c;	// un-corrupt buffer
+		}
+		putchar('.');
+	}
+
+	// Test various number of sources, full length
+	for (j = 4; j <= TEST_SOURCES + 2; j++) {
+		// New random data
+		for (i = 0; i < j; i++)
+			rand_buffer(buffs[i], TEST_LEN);
+
+		// Generate p,q parity for this number of sources
+		ref_multi_pq(j, TEST_LEN, buffs);
+
+		// Set errors up in each source and len position
+		for (i = 0; i < j; i++) {
+			for (k = 0; k < TEST_LEN; k++) {
+				// See if it still passes
+				ret = pq_check(j, TEST_LEN, buffs);
+				if (ret != 0) {	// Should pass
+					printf("\nfail rand fixed len test %d sources\n", j);
+					fail++;
+					return 1;
+				}
+
+				c = ((char *)buffs[i])[k];
+				((char *)buffs[i])[k] = c ^ 1;	// corrupt buffer
+
+				ret = pq_check(j, TEST_LEN, buffs);
+				if (ret == 0) {	// Should fail
+					printf
+					    ("\nfail rand fixed len test corrupted buffer %d sources\n",
+					     j);
+					fail++;
+					return 1;
+				}
+				((char *)buffs[i])[k] = c;	// un-corrupt buffer
+			}
+		}
+		putchar('.');
+	}
+
+	fflush(0);
+
+	// Test various number of sources and len
+	k = 16;
+	while (k <= TEST_LEN) {
+		char *tmp;
+		for (j = 4; j <= TEST_SOURCES + 2; j++) {
+			for (i = 0; i < j; i++)
+				rand_buffer(buffs[i], k);
+
+			// Generate p,q parity for this number of sources
+			ref_multi_pq(j, k, buffs);
+
+			// Inject errors at various source and len positions
+			for (lerr = 0; lerr < k; lerr++) {
+				for (serr = 0; serr < j; serr++) {
+					// See if it still passes
+					ret = pq_check(j, k, buffs);
+					if (ret != 0) {	// Should pass
+						printf
+						    ("\nfail rand var src, len test %d sources, len=%d\n",
+						     j, k);
+						fail++;
+						return 1;
+					}
+
+					tmp = (char *)buffs[serr];
+					c = tmp[lerr];
+					((char *)buffs[serr])[lerr] = c ^ 1;	// corrupt buffer
+
+					ret = pq_check(j, k, buffs);
+					if (ret == 0) {	// Should fail
+						printf
+						    ("\nfail rand var src, len test corrupted buffer "
+						     "%d sources, len=%d, ret=%d\n", j, k,
+						     ret);
+						fail++;
+						return 1;
+					}
+					((char *)buffs[serr])[lerr] = c;	// un-corrupt buffer
+				}
+			}
+			putchar('.');
+			fflush(0);
+		}
+		k += 16;
+	}
+
+	// Test at the end of buffer
+	for (i = 0; i < TEST_LEN; i += 16) {
+		for (j = 0; j < TEST_SOURCES + 2; j++) {
+			// Randomize the window that is actually checked
+			// (offset i up to the end of the allocation), not
+			// the first TEST_LEN - i bytes of the buffer.
+			tmp_buf[j] = (char *)buffs[j] + i;
+			rand_buffer((unsigned char *)tmp_buf[j], TEST_LEN - i);
+		}
+
+		pq_gen_base(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+
+		// Test good data
+		ret = pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+		if (ret != 0) {
+			printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+			fail++;
+			return 1;
+		}
+		// Test bad data
+		for (serr = 0; serr < TEST_SOURCES + 2; serr++) {
+			for (lerr = 0; lerr < (TEST_LEN - i); lerr++) {
+				c = tmp_buf[serr][lerr];
+				tmp_buf[serr][lerr] = c ^ 1;
+
+				ret =
+				    pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+				if (ret == 0) {
+					printf("fail end test corrupted buffer - "
+					       "offset: %d, len: %d, ret: %d\n", i,
+					       TEST_LEN - i, ret);
+					fail++;
+					return 1;
+				}
+
+				tmp_buf[serr][lerr] = c;
+			}
+		}
+
+		putchar('.');
+		fflush(0);
+	}
+
+	// Release the test buffers on the normal exit path
+	for (i = 0; i < TEST_SOURCES + 2; i++)
+		free(buffs[i]);
+
+	if (fail == 0)
+		printf("Pass\n");
+
+	return fail;
+
+}
--- a/raid/pq_gen_avx.asm
+++ b/raid/pq_gen_avx.asm
@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX
+;;; int pq_gen_avx(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array).  Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes.  Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+;;; elf64: arguments arrive in rdi/rsi/rdx; FUNC_SAVE and FUNC_RESTORE
+;;; expand to nothing for this format.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp3  arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; win64: arguments arrive in rcx/rdx/r8.  FUNC_SAVE spills xmm6-xmm11,
+;;; xmm14 and xmm15 (all of the xmm6+ registers aliased below) into a
+;;; stack frame and FUNC_RESTORE reloads them and releases the frame.
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmp3  r10
+ %define return rax
+ %define stack_size  8*16 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	save_xmm128	xmm8, 2*16
+	save_xmm128	xmm9, 3*16
+	save_xmm128	xmm10, 4*16
+	save_xmm128	xmm11, 5*16
+	save_xmm128	xmm14, 6*16
+	save_xmm128	xmm15, 7*16
+	end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	movdqa	xmm8, [rsp + 2*16]
+	movdqa	xmm9, [rsp + 3*16]
+	movdqa	xmm10, [rsp + 4*16]
+	movdqa	xmm11, [rsp + 5*16]
+	movdqa	xmm14, [rsp + 6*16]
+	movdqa	xmm15, [rsp + 7*16]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+;;; Symbolic names for the general purpose registers used by the function.
+%define vec arg0
+%define	len arg1
+%define ptr arg3
+%define pos rax
+
+;;; Three independent 16-byte lanes, each with a P accumulator, a Q
+;;; accumulator, a scratch register and a source register.
+%define xp1   xmm0
+%define xq1   xmm1
+%define xtmp1 xmm2
+%define xs1   xmm3
+
+%define xp2   xmm4
+%define xq2   xmm5
+%define xtmp2 xmm6
+%define xs2   xmm7
+
+%define xp3   xmm8
+%define xq3   xmm9
+%define xtmp3 xmm10
+%define xs3   xmm11
+
+;;; Constants held in registers for the whole call: all-zero and poly.
+%define xzero xmm14
+%define xpoly xmm15
+
+;;; Use Non-temporal load/stor
+;;; (define NO_NT_LDST to use plain aligned moves instead)
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+;;; pq_gen_avx(vects, len, array)
+;;;   Writes P (xor of all sources) and Q (sources folded from the last one
+;;;   down to source 0, doubling each byte and xoring in poly where bit 7
+;;;   was set) to the last two pointers of array.  Processes 48 bytes per
+;;;   iteration while possible, then 16 bytes at a time.
+;;;   Returns 0 on success (or len == 0), 1 when vects <= 3 or len is not a
+;;;   multiple of 16.
+;;;   Both inner loops rely on the flags set by "sub tmp, 1" still being
+;;;   intact at the closing "jg"; only mov and AVX instructions sit in
+;;;   between.
+align 16
+global pq_gen_avx:function
+func(pq_gen_avx)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (16-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0
+	vmovdqa	xpoly, [poly]
+	vpxor	xzero, xzero, xzero
+	cmp	len, 48
+	jl	loop16
+
+len_aligned_32bytes:
+	sub	len, 48			;Len points to last block
+
+loop48:
+	mov 	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+16]	;Preload last vector (source)
+	XLDR	xs3, [ptr+pos+32]	;Preload last vector (source)
+	vpxor	xp1, xp1, xp1		;p1 = 0
+	vpxor	xp2, xp2, xp2		;p2 = 0
+	vpxor	xp3, xp3, xp3		;p3 = 0
+	vpxor	xq1, xq1, xq1		;q1 = 0
+	vpxor	xq2, xq2, xq2		;q2 = 0
+	vpxor	xq3, xq3, xq3		;q3 = 0
+
+next_vect:
+	sub	tmp, 1		  	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	vpxor	xq1, xq1, xs1		; q1 ^= s1
+	vpxor	xq2, xq2, xs2		; q2 ^= s2
+	vpxor	xq3, xq3, xs3		; q3 ^= s3
+	vpxor	xp1, xp1, xs1		; p1 ^= s1
+	vpxor	xp2, xp2, xs2		; p2 ^= s2
+	vpxor	xp3, xp3, xs3		; p3 ^= s3
+	vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+	vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+	vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+16]	; Get next vector (source data2)
+	XLDR	xs3, [ptr+pos+32]	; Get next vector (source data3)
+	vpaddb	xq1, xq1, xq1		; q1 = q1<<1
+	vpaddb	xq2, xq2, xq2		; q2 = q2<<1
+	vpaddb	xq3, xq3, xq3		; q3 = q3<<1
+	vpxor	xq1, xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	vpxor	xq2, xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	vpxor	xq3, xq3, xtmp3		; q3 = q3<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0 (flags from sub)
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	vpxor	xp1, xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	vpxor	xq1, xq1, xs1		;q1 ^= 1 * s1[0]
+	vpxor	xp2, xp2, xs2		;p2 ^= s2[0]
+	vpxor	xq2, xq2, xs2		;q2 ^= 1 * s2[0]
+	vpxor	xp3, xp3, xs3		;p3 ^= s3[0]
+	vpxor	xq3, xq3, xs3		;q3 ^= 1 * s3[0]
+	XSTR	[ptr+pos], xp1		;Write parity P1 vector
+	XSTR	[ptr+pos+16], xp2	;Write parity P2 vector
+	XSTR	[ptr+pos+32], xp3	;Write parity P3 vector
+	XSTR	[tmp+pos], xq1		;Write parity Q1 vector
+	XSTR	[tmp+pos+16], xq2	;Write parity Q2 vector
+	XSTR	[tmp+pos+32], xq3	;Write parity Q3 vector
+	add	pos, 48
+	cmp	pos, len
+	jle	loop48
+
+	;; ------------------------------
+	;; Do last 16 or 32 Bytes remaining
+	add	len, 48			;Restore the real length
+	cmp	pos, len
+	je	return_pass
+
+loop16:
+	mov 	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	vpxor	xp1, xp1, xp1		;p = 0
+	vpxor	xq1, xq1, xq1		;q = 0
+
+next_vect16:
+	sub	tmp, 1		  	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	vpxor	xq1, xq1, xs1		; q1 ^= s1
+	vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+	vpxor	xp1, xp1, xs1		; p ^= s
+	vpaddb	xq1, xq1, xq1		; q = q<<1
+	vpxor	xq1, xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect16		; Loop for each vect except 0 (flags from sub)
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	vpxor	xp1, xp1, xs1		;p ^= s[0] - last source is already loaded
+	vpxor	xq1, xq1, xs1		;q ^= 1 * s[0]
+	XSTR	[ptr+pos], xp1		;Write parity P vector
+	XSTR	[tmp+pos], xq1		;Write parity Q vector
+	add	pos, 16
+	cmp	pos, len
+	jl	loop16
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;; 16 bytes of 0x1d, loaded into xpoly; vpblendvb selects it for bytes of
+;;; Q whose top bit is set before the doubling.
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;;       func        core, ver, snum
+slversion pq_gen_avx, 02,   0a,  0039
--- a/raid/pq_gen_avx2.asm
+++ b/raid/pq_gen_avx2.asm
@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX2
+;;; int pq_gen_avx2(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array).  Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 32 bytes.  Length must be 32 byte aligned.
+
+%include "reg_sizes.asm"
+
+;;; elf64: arguments arrive in rdi/rsi/rdx; FUNC_SAVE and FUNC_RESTORE
+;;; expand to nothing for this format.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp3  arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; win64: arguments arrive in rcx/rdx/r8.  FUNC_SAVE spills the full
+;;; 256-bit ymm6-ymm11, ymm14 and ymm15 with unaligned stores (the frame is
+;;; not guaranteed to be 32-byte aligned) and FUNC_RESTORE reloads them and
+;;; releases the frame.
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmp3  r10
+ %define return rax
+ %define stack_size  8*32 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	;; Until a sav_ymm256 is defined
+	vmovdqu	[rsp + 0*32], ymm6
+	vmovdqu	[rsp + 1*32], ymm7
+	vmovdqu	[rsp + 2*32], ymm8
+	vmovdqu	[rsp + 3*32], ymm9
+	vmovdqu	[rsp + 4*32], ymm10
+	vmovdqu	[rsp + 5*32], ymm11
+	vmovdqu	[rsp + 6*32], ymm14
+	vmovdqu	[rsp + 7*32], ymm15
+	end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqu	ymm6, [rsp + 0*32]
+	vmovdqu	ymm7, [rsp + 1*32]
+	vmovdqu	ymm8, [rsp + 2*32]
+	vmovdqu	ymm9, [rsp + 3*32]
+	vmovdqu	ymm10, [rsp + 4*32]
+	vmovdqu	ymm11, [rsp + 5*32]
+	vmovdqu	ymm14, [rsp + 6*32]
+	vmovdqu	ymm15, [rsp + 7*32]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+;;; Symbolic names for the general purpose registers used by the function.
+%define vec arg0
+%define	len arg1
+%define ptr arg3
+%define pos rax
+
+;;; Three independent 32-byte lanes, each with a P accumulator, a Q
+;;; accumulator, a scratch register and a source register.
+%define xp1   ymm0
+%define xq1   ymm1
+%define xtmp1 ymm2
+%define xs1   ymm3
+
+%define xp2   ymm4
+%define xq2   ymm5
+%define xtmp2 ymm6
+%define xs2   ymm7
+
+%define xp3   ymm8
+%define xq3   ymm9
+%define xtmp3 ymm10
+%define xs3   ymm11
+
+;;; Constants held in registers for the whole call: all-zero and poly.
+%define xzero ymm14
+%define xpoly ymm15
+
+;;; Use Non-temporal load/stor
+;;; (define NO_NT_LDST to use plain aligned moves instead)
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+;;; pq_gen_avx2(vects, len, array)
+;;;   Writes P (xor of all sources) and Q (sources folded from the last one
+;;;   down to source 0, doubling each byte and xoring in poly where bit 7
+;;;   was set) to the last two pointers of array.  Processes 96 bytes per
+;;;   iteration while possible, then 32 bytes at a time.
+;;;   Returns 0 on success (or len == 0), 1 when vects <= 3 or len is not a
+;;;   multiple of 32.
+;;;   Both inner loops rely on the flags set by "sub tmp, 1" still being
+;;;   intact at the closing "jg"; only mov and AVX instructions sit in
+;;;   between.
+align 16
+global pq_gen_avx2:function
+func(pq_gen_avx2)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (32-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0
+	vmovdqa	xpoly, [poly]
+	vpxor	xzero, xzero, xzero
+	cmp	len, 96
+	jl	loop32
+
+len_aligned_32bytes:
+	sub	len, 3*32		;Len points to last block
+
+loop96:
+	mov	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+32]	;Preload last vector (source)
+	XLDR	xs3, [ptr+pos+64]	;Preload last vector (source)
+	vpxor	xp1, xp1, xp1		;p1 = 0
+	vpxor	xp2, xp2, xp2		;p2 = 0
+	vpxor	xp3, xp3, xp3		;p3 = 0
+	vpxor	xq1, xq1, xq1		;q1 = 0
+	vpxor	xq2, xq2, xq2		;q2 = 0
+	vpxor	xq3, xq3, xq3		;q3 = 0
+
+next_vect:
+	sub	tmp, 1		  	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	vpxor	xq1, xq1, xs1		; q1 ^= s1
+	vpxor	xq2, xq2, xs2		; q2 ^= s2
+	vpxor	xq3, xq3, xs3		; q3 ^= s3
+	vpxor	xp1, xp1, xs1		; p1 ^= s1
+	vpxor	xp2, xp2, xs2		; p2 ^= s2
+	vpxor	xp3, xp3, xs3		; p3 ^= s3
+	vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+	vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+	vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+32]	; Get next vector (source data2)
+	XLDR	xs3, [ptr+pos+64]	; Get next vector (source data3)
+	vpaddb	xq1, xq1, xq1		; q1 = q1<<1
+	vpaddb	xq2, xq2, xq2		; q2 = q2<<1
+	vpaddb	xq3, xq3, xq3		; q3 = q3<<1
+	vpxor	xq1, xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	vpxor	xq2, xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	vpxor	xq3, xq3, xtmp3		; q3 = q3<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0 (flags from sub)
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	vpxor	xp1, xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	vpxor	xq1, xq1, xs1		;q1 ^= 1 * s1[0]
+	vpxor	xp2, xp2, xs2		;p2 ^= s2[0]
+	vpxor	xq2, xq2, xs2		;q2 ^= 1 * s2[0]
+	vpxor	xp3, xp3, xs3		;p3 ^= s3[0]
+	vpxor	xq3, xq3, xs3		;q3 ^= 1 * s3[0]
+	XSTR	[ptr+pos], xp1		;Write parity P1 vector
+	XSTR	[ptr+pos+32], xp2	;Write parity P2 vector
+	XSTR	[ptr+pos+64], xp3	;Write parity P3 vector
+	XSTR	[tmp+pos], xq1		;Write parity Q1 vector
+	XSTR	[tmp+pos+32], xq2	;Write parity Q2 vector
+	XSTR	[tmp+pos+64], xq3	;Write parity Q3 vector
+	add	pos, 3*32
+	cmp	pos, len
+	jle	loop96
+
+	;; ------------------------------
+	;; Do last 32 or 64 Bytes remaining
+	add	len, 3*32		;Restore the real length
+	cmp	pos, len
+	je	return_pass
+
+loop32:
+	mov 	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	vpxor	xp1, xp1, xp1		;p = 0
+	vpxor	xq1, xq1, xq1		;q = 0
+
+next_vect32:
+	sub	tmp, 1		  	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	vpxor	xq1, xq1, xs1		; q1 ^= s1
+	vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+	vpxor	xp1, xp1, xs1		; p ^= s
+	vpaddb	xq1, xq1, xq1		; q = q<<1
+	vpxor	xq1, xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect32		; Loop for each vect except 0 (flags from sub)
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	vpxor	xp1, xp1, xs1		;p ^= s[0] - last source is already loaded
+	vpxor	xq1, xq1, xs1		;q ^= 1 * s[0]
+	XSTR	[ptr+pos], xp1		;Write parity P vector
+	XSTR	[tmp+pos], xq1		;Write parity Q vector
+	add	pos, 32
+	cmp	pos, len
+	jl	loop32
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;; 32 bytes of 0x1d, 32-byte aligned for the aligned ymm load into xpoly;
+;;; vpblendvb selects it for bytes of Q whose top bit is set before the
+;;; doubling.
+align 32
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;;       func        core,  ver, snum
+slversion pq_gen_avx2, 04,   03,  0041
--- a/raid/pq_gen_perf.c
+++ b/raid/pq_gen_perf.c
@ -0,0 +1,97 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include "raid.h"
+#include "test.h"
+
+// Benchmark configuration.  Exactly one of three variants is selected:
+//   CACHED_TEST defined -> small data set, many loops ("_warm")
+//   default             -> data set larger than GT_L3_CACHE ("_cold")
+//   TEST_CUSTOM defined -> TEST_SOURCES / TEST_LEN must be supplied by the
+//                          builder; only TEST_LOOPS gets a default here
+// Every expression macro is fully parenthesized so it expands safely next
+// to any operator (e.g. x / TEST_LEN).
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_SOURCES 10
+# define TEST_LEN     (8*1024)
+# define TEST_LOOPS   40000
+# define TEST_TYPE_STR "_warm"
+#else
+# ifndef TEST_CUSTOM
+// Uncached test.  Pull from large mem base.
+#  define TEST_SOURCES 10
+#  define GT_L3_CACHE  (32*1024*1024)	/* some number > last level cache */
+#  define TEST_LEN     ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
+#  define TEST_LOOPS   1000
+#  define TEST_TYPE_STR "_cold"
+# else
+#  define TEST_TYPE_STR "_cus"
+#  ifndef TEST_LOOPS
+#   define TEST_LOOPS  1000
+#  endif
+# endif
+#endif
+
+// Total bytes touched by one pq_gen call: all sources plus the P and Q buffers
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+
+// Benchmark driver for pq_gen: allocates TEST_SOURCES source buffers plus
+// the P and Q destinations, runs one warm-up call, then times TEST_LOOPS
+// calls and reports the result through perf_print.
+// Returns 0 on success, 1 if allocation or the warm-up call fails.
+int main(int argc, char *argv[])
+{
+	int i, allocated = 0;
+	void *buffs[TEST_SOURCES + 2];
+	struct perf start, stop;
+
+	printf("Test pq_gen_perf %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+	// Allocate the arrays
+	for (i = 0; i < TEST_SOURCES + 2; i++) {
+		int ret;
+		void *buf;
+		ret = posix_memalign(&buf, 32, TEST_LEN);
+		if (ret) {
+			printf("alloc error: Fail");
+			// Release whatever was obtained before the failure
+			while (allocated > 0)
+				free(buffs[--allocated]);
+			return 1;
+		}
+		buffs[i] = buf;
+		allocated++;
+	}
+
+	// Setup data
+	for (i = 0; i < TEST_SOURCES + 2; i++)
+		memset(buffs[i], 0, TEST_LEN);
+
+	// Warm up.  A non-zero return means pq_gen rejected the arguments, in
+	// which case the timed loop below would measure nothing useful.
+	if (pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs) != 0) {
+		printf("pq_gen" TEST_TYPE_STR ": Fail\n");
+		while (allocated > 0)
+			free(buffs[--allocated]);
+		return 1;
+	}
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++)
+		pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+	perf_stop(&stop);
+	printf("pq_gen" TEST_TYPE_STR ": ");
+	// i == TEST_LOOPS here, so this is the total number of bytes processed
+	perf_print(stop, start, (long long)TEST_MEM * i);
+
+	while (allocated > 0)
+		free(buffs[--allocated]);
+
+	return 0;
+}
--- a/raid/pq_gen_sse.asm
+++ b/raid/pq_gen_sse.asm
@ -0,0 +1,258 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using SSE3
+;;; int pq_gen_sse(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array).  Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes.  Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+;;; Calling-convention glue: maps the generic names used by the routine
+;;; (arg0.., tmp, return) onto concrete registers for each output format and
+;;; defines the FUNC_SAVE / FUNC_RESTORE prologue/epilogue helpers.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp3  arg4
+ %define return rax
+ %define func(x) x:
+ ; Nothing to save for elf64: both macros expand to nothing
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmp3  r10
+ %define return rax
+ ; Room for the seven xmm registers saved below, plus 8 bytes of padding
+ %define stack_size  7*16 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ ; Saves xmm6-xmm11 and xmm15, i.e. every register at or above xmm6 that
+ ; the xmm aliases further down refer to.
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	save_xmm128	xmm8, 2*16
+	save_xmm128	xmm9, 3*16
+	save_xmm128	xmm10, 4*16
+	save_xmm128	xmm11, 5*16
+	save_xmm128	xmm15, 6*16
+	end_prolog
+ %endmacro
+
+ ; Mirror of FUNC_SAVE: reload the saved xmm registers and release the frame
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	movdqa	xmm8, [rsp + 2*16]
+	movdqa	xmm9, [rsp + 3*16]
+	movdqa	xmm10, [rsp + 4*16]
+	movdqa	xmm11, [rsp + 5*16]
+	movdqa	xmm15, [rsp + 6*16]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+;;; Symbolic names for the routine's working registers
+%define vec arg0			; vects argument; later the index of the last source
+%define	len arg1			; byte length of every vector
+%define ptr arg3			; scratch: pointer to the vector being read/written
+%define pos rax				; current byte offset (same register as `return`)
+
+;;; Lane 1: running P, running Q, scratch mask, loaded source data
+%define xp1   xmm0
+%define xq1   xmm1
+%define xtmp1 xmm2
+%define xs1   xmm3
+
+;;; Lane 2 (bytes +16 of the current 48-byte block)
+%define xp2   xmm4
+%define xq2   xmm5
+%define xtmp2 xmm6
+%define xs2   xmm7
+
+;;; Lane 3 (bytes +32 of the current 48-byte block)
+%define xp3   xmm8
+%define xq3   xmm9
+%define xtmp3 xmm10
+%define xs3   xmm11
+
+%define xpoly xmm15			; holds the 16-byte constant loaded from [poly]
+
+;;; Use Non-temporal load/stor
+;;; Defining NO_NT_LDST switches both accessors to plain aligned moves.
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+;;; pq_gen_sse(vects, len, array)
+;;;   vec (arg0) = vects, len (arg1) = bytes per vector, arg2 = pointer array.
+;;;   array[0 .. vects-3] are sources, array[vects-2] receives P and
+;;;   array[vects-1] receives Q.
+;;;   Returns 0 on success (including len == 0) and 1 when vects <= 3 or len
+;;;   is not a multiple of 16.
+;;;   Work is done 48 bytes at a time (three xmm lanes) while at least 48
+;;;   bytes remain, then 16 bytes at a time for the rest.
+;;;   For each position the sources are walked from the last one down to
+;;;   source 0: P accumulates the xor of all sources, Q is doubled per byte
+;;;   (with the poly correction) before each next source is folded in.
+;;;   NOTE(review): vects and len are declared int in the prototype comment at
+;;;   the top of the file but are used here as full-width registers;
+;;;   presumably callers pass them already extended - confirm.
+align 16
+global pq_gen_sse:function
+func(pq_gen_sse)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (16-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0
+	movdqa	xpoly, [poly]
+	cmp	len, 48
+	jl	loop16
+
+	;; Reached only when len >= 48 (the label name mentions 32 bytes, but the
+	;; block size handled by loop48 is 48)
+len_aligned_32bytes:
+	sub	len, 48			;Len points to last block
+
+loop48:
+	mov 	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+16]	;Preload last vector (source)
+	XLDR	xs3, [ptr+pos+32]	;Preload last vector (source)
+	pxor	xp1, xp1		;p1 = 0
+	pxor	xp2, xp2		;p2 = 0
+	pxor	xp3, xp3		;p3 = 0
+	pxor	xq1, xq1		;q1 = 0
+	pxor	xq2, xq2		;q2 = 0
+	pxor	xq3, xq3		;q3 = 0
+
+next_vect:
+	;; The flags produced by this sub are the ones tested by the jg at the
+	;; bottom of the loop; nothing in between (mov and SSE integer ops)
+	;; writes the flags, so no instruction that does may be inserted here.
+	sub	tmp, 1		  	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	pxor	xq1, xs1		; q1 ^= s1
+	pxor	xq2, xs2		; q2 ^= s2
+	pxor	xq3, xs3		; q3 ^= s3
+	pxor	xp1, xs1		; p1 ^= s1
+	pxor	xp2, xs2		; p2 ^= s2
+	pxor	xp3, xs3		; p3 ^= s3
+	pxor	xtmp1, xtmp1		; xtmp1 = 0 - for compare to 0
+	pxor	xtmp2, xtmp2		; xtmp2 = 0
+	pxor	xtmp3, xtmp3		; xtmp3 = 0
+	pcmpgtb	xtmp1, xq1		; xtmp1 = 0xff where 0 > q1 byte (bit7 set), else 0x00
+	pcmpgtb	xtmp2, xq2		; xtmp2 = 0xff where 0 > q2 byte (bit7 set), else 0x00
+	pcmpgtb	xtmp3, xq3		; xtmp3 = 0xff where 0 > q3 byte (bit7 set), else 0x00
+	pand	xtmp1, xpoly		; xtmp1 = poly or 0x00
+	pand	xtmp2, xpoly		; xtmp2 = poly or 0x00
+	pand	xtmp3, xpoly		; xtmp3 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+16]	; Get next vector (source data2)
+	XLDR	xs3, [ptr+pos+32]	; Get next vector (source data3)
+	paddb	xq1, xq1		; q1 = q1<<1
+	paddb	xq2, xq2		; q2 = q2<<1
+	paddb	xq3, xq3		; q3 = q3<<1
+	pxor	xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	pxor	xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	pxor	xq3, xtmp3		; q3 = q3<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0 (flags from sub tmp, 1)
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	pxor	xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	pxor	xq1, xs1		;q1 ^= 1 * s1[0]
+	pxor	xp2, xs2		;p2 ^= s2[0]
+	pxor	xq2, xs2		;q2 ^= 1 * s2[0]
+	pxor	xp3, xs3		;p3 ^= s3[0]
+	pxor	xq3, xs3		;q3 ^= 1 * s3[0]
+	XSTR	[ptr+pos], xp1		;Write parity P1 vector
+	XSTR	[ptr+pos+16], xp2	;Write parity P2 vector
+	XSTR	[ptr+pos+32], xp3	;Write parity P3 vector
+	XSTR	[tmp+pos], xq1		;Write parity Q1 vector
+	XSTR	[tmp+pos+16], xq2	;Write parity Q2 vector
+	XSTR	[tmp+pos+32], xq3	;Write parity Q3 vector
+	add	pos, 48
+	cmp	pos, len
+	jle	loop48			;len was reduced by 48, so another full block fits
+
+	;; ------------------------------
+	;; Do last 16 or 32 Bytes remaining
+	add	len, 48			;Restore the true length
+	cmp	pos, len
+	je	return_pass
+
+loop16:
+	mov 	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	pxor	xp1, xp1		;p = 0
+	pxor	xq1, xq1		;q = 0
+
+next_vect16:
+	;; As in next_vect: the jg below consumes the flags set by this sub
+	sub	tmp, 1		  	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	pxor	xq1, xs1		; q1 ^= s1
+	pxor	xtmp1, xtmp1		; xtmp = 0
+	pcmpgtb	xtmp1, xq1		; xtmp = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp = poly or 0x00
+	pxor	xp1, xs1		; p ^= s
+	paddb	xq1, xq1		; q = q<<1
+	pxor	xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect16		; Loop for each vect except 0
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	pxor	xp1, xs1		;p ^= s[0] - last source is already loaded
+	pxor	xq1, xs1		;q ^= 1 * s[0]
+	XSTR	[ptr+pos], xp1		;Write parity P vector
+	XSTR	[tmp+pos], xq1		;Write parity Q vector
+	add	pos, 16
+	cmp	pos, len
+	jl	loop16
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;; 16 bytes of 0x1d, loaded into xpoly with an aligned move.  The routine
+;;; xors it (masked per byte) into Q whenever doubling a byte shifts out bit 7.
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; Version record for this function; the slversion macro is not defined in
+;;; this file (presumably it comes from reg_sizes.asm - confirm).
+;;;       func        core, ver, snum
+slversion pq_gen_sse, 00,   09,  0032
--- a/raid/pq_gen_sse_i32.asm
+++ b/raid/pq_gen_sse_i32.asm
@ -0,0 +1,264 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using SSE3
+;;; int pq_gen_sse(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array).  Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes.  Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+;;; Calling-convention glue for the three supported output formats.  Each
+;;; branch defines the argument/scratch register names, the pointer size PS
+;;; (8 for 64-bit formats, 4 for elf32) and the FUNC_SAVE / FUNC_RESTORE
+;;; prologue/epilogue helpers.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ ; Nothing to save for elf64: both macros expand to nothing
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define return rax
+ %define PS 8
+ %define tmp   r10
+ ; Three xmm save slots.  xmm6 and xmm7 back xtmp2/xs2; xmm15 backs xpoly in
+ ; 64-bit builds.  All three are callee-saved in the Windows x64 convention,
+ ; so xmm15 must be preserved as well, not just xmm6/xmm7.
+ %define stack_size  3*16 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	save_xmm128	xmm15, 2*16
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	movdqa	xmm15, [rsp + 2*16]
+	add	rsp, stack_size
+ %endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ ; 32-bit: arguments live on the stack and are copied into registers by
+ ; FUNC_SAVE through the arg(x) accessor.
+ %define arg0   edx
+ %define arg1   ecx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2  edi	; must save/restore
+ %define arg3  esi
+ %define tmp   ebx
+
+ ; Build an ebp frame, preserve esi/edi/ebx, then load the three arguments
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg0, arg(0)
+	mov	arg1, arg(1)
+	mov	arg2, arg(2)
+ %endmacro
+
+ ; Undo FUNC_SAVE in reverse order
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	mov	esp, ebp	;if has frame pointer?
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
+;;; Symbolic names for the routine's working registers
+%define vec arg0			; vects argument; later the index of the last source
+%define	len arg1			; byte length of every vector
+%define ptr arg3			; scratch: pointer to the vector being read/written
+%define pos return			; byte offset; shares the return-value register
+
+;;; Lane 1: running P, running Q, scratch mask, loaded source data
+%define xp1   xmm0
+%define xq1   xmm1
+%define xtmp1 xmm2
+%define xs1   xmm3
+
+;;; Lane 2 (bytes +16 of the current 32-byte block)
+%define xp2   xmm4
+%define xq2   xmm5
+%define xtmp2 xmm6
+%define xs2   xmm7
+
+;;; 64-bit builds keep the constant in xmm15 (loaded once by the routine);
+;;; 32-bit builds use the memory operand [poly] directly instead.
+%ifidn PS,8			; 64-bit code
+ default rel
+ [bits 64]
+ %define xpoly xmm15
+%elifidn PS,4			; 32-bit code
+ %define xpoly [poly]
+%endif
+
+;;; Use Non-temporal load/stor
+;;; Defining NO_NT_LDST switches both accessors to plain aligned moves.
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+;;; pq_gen_sse(vects, len, array) - build shared between 32- and 64-bit
+;;;   vec (arg0) = vects, len (arg1) = bytes per vector, arg2 = pointer array
+;;;   whose entries are PS bytes wide.
+;;;   array[0 .. vects-3] are sources, array[vects-2] receives P and
+;;;   array[vects-1] receives Q.
+;;;   Returns 0 on success (including len == 0) and 1 when vects <= 3 or len
+;;;   is not a multiple of 16.
+;;;   Work is done 32 bytes at a time (two xmm lanes) while at least 32 bytes
+;;;   remain, then 16 bytes at a time for the rest.
+align 16
+global pq_gen_sse:function
+func(pq_gen_sse)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (16-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0
+%ifidn PS,8
+	movdqa	xpoly, [poly]		;For 64-bit, load poly into high xmm reg
+%endif
+	cmp	len, 32
+	jl	loop16
+
+	;; Reached only when len >= 32
+len_aligned_32bytes:
+	sub	len, 32			;len now marks the start of the last full 32-byte block
+
+loop32:
+	mov 	ptr, [arg2+vec*PS] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+16]	;Preload last vector (source)
+	pxor	xp1, xp1		;p1 = 0
+	pxor	xq1, xq1		;q1 = 0
+	pxor	xp2, xp2		;p2 = 0
+	pxor	xq2, xq2		;q2 = 0
+
+next_vect:
+	;; The flags produced by this sub are the ones tested by the jg at the
+	;; bottom of the loop; nothing in between (mov and SSE integer ops)
+	;; writes the flags, so no instruction that does may be inserted here.
+	sub	tmp, 1		 	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*PS] 	; get pointer to next vect
+	pxor	xq1, xs1		; q1 ^= s1
+	pxor	xq2, xs2		; q2 ^= s2
+	pxor	xp1, xs1		; p1 ^= s1
+	pxor	xp2, xs2		; p2 ^= s2
+	pxor	xtmp1, xtmp1		; xtmp1 = 0 - for compare to 0
+	pxor	xtmp2, xtmp2		; xtmp2 = 0
+	pcmpgtb	xtmp1, xq1		; xtmp1 = 0xff where 0 > q1 byte (bit7 set), else 0x00
+	pcmpgtb	xtmp2, xq2		; xtmp2 = 0xff where 0 > q2 byte (bit7 set), else 0x00
+	pand	xtmp1, xpoly		; xtmp1 = poly or 0x00
+	pand	xtmp2, xpoly		; xtmp2 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+16]	; Get next vector (source data2)
+	paddb	xq1, xq1		; q1 = q1<<1
+	paddb	xq2, xq2		; q2 = q2<<1
+	pxor	xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	pxor	xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0 (flags from sub tmp, 1)
+
+	mov	ptr, [arg2+PS+vec*PS]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+	pxor	xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	pxor	xq1, xs1		;q1 ^= 1 * s1[0]
+	pxor	xp2, xs2		;p2 ^= s2[0]
+	pxor	xq2, xs2		;q2 ^= 1 * s2[0]
+	XSTR	[ptr+pos], xp1		;Write parity P1 vector
+	XSTR	[ptr+pos+16], xp2	;Write parity P2 vector
+	XSTR	[tmp+pos], xq1		;Write parity Q1 vector
+	XSTR	[tmp+pos+16], xq2	;Write parity Q2 vector
+	add	pos, 32
+	cmp	pos, len
+	jle	loop32			;len was reduced by 32, so another full block fits
+
+	;; ------------------------------
+	;; Do last 16 Bytes remaining
+	add	len, 32			;Restore the true length
+	cmp	pos, len
+	je	return_pass
+
+loop16:
+	mov 	ptr, [arg2+vec*PS] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	pxor	xp1, xp1		;p = 0
+	pxor	xq1, xq1		;q = 0
+
+next_vect16:
+	;; As in next_vect: the jg below consumes the flags set by this sub
+	sub	tmp, 1		  	;Inner loop for each source vector
+	mov 	ptr, [arg2+tmp*PS] 	; get pointer to next vect
+	pxor	xq1, xs1		; q1 ^= s1
+	pxor	xtmp1, xtmp1		; xtmp = 0
+	pcmpgtb	xtmp1, xq1		; xtmp = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp = poly or 0x00
+	pxor	xp1, xs1		; p ^= s
+	paddb	xq1, xq1		; q = q<<1
+	pxor	xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect16		; Loop for each vect except 0
+
+	mov	ptr, [arg2+PS+vec*PS]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+	pxor	xp1, xs1		;p ^= s[0] - last source is already loaded
+	pxor	xq1, xs1		;q ^= 1 * s[0]
+	XSTR	[ptr+pos], xp1		;Write parity P vector
+	XSTR	[tmp+pos], xq1		;Write parity Q vector
+	add	pos, 16
+	cmp	pos, len
+	jl	loop16
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;; 16 bytes of 0x1d.  64-bit builds load it into a register once; 32-bit
+;;; builds use it directly as the memory operand of pand, which is why it is
+;;; kept 16-byte aligned.
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; Version record for this function; the slversion macro is not defined in
+;;; this file (presumably it comes from reg_sizes.asm - confirm).
+;;;       func        core, ver, snum
+slversion pq_gen_sse, 00,   08,  0032
--- a/raid/pq_gen_test.c
+++ b/raid/pq_gen_test.c
@ -0,0 +1,194 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<limits.h>
+#include "raid.h"
+#include "types.h"
+
+// Largest number of source vectors exercised (P and Q buffers are extra)
+#define TEST_SOURCES 16
+// Size in bytes of every allocated buffer, and the longest length tested
+#define TEST_LEN     1024
+// Total bytes across all sources plus the P and Q buffers
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+// Seed passed to srand(); may be overridden at build time
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Fill the first buffer_size bytes of buf with values taken from rand(),
+// one rand() call per byte.  Does nothing when buffer_size <= 0.
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	unsigned char *out = buf;
+	long remaining = buffer_size;
+
+	while (remaining-- > 0)
+		*out++ = rand();
+}
+
+// Print the first len bytes of buf in hex, 16 values per line, followed by
+// one extra newline.  Always returns 0.
+int dump(unsigned char *buf, int len)
+{
+	int printed;
+
+	for (printed = 1; printed <= len; printed++) {
+		printf(" %2x", buf[printed - 1]);
+		// Break the line after every 16th value
+		if (printed % 16 == 0)
+			printf("\n");
+	}
+	printf("\n");
+	return 0;
+}
+
+// Functional test for pq_gen.  Runs, in order: an all-zero test, a random
+// test at full size, random tests over every vector count from 4 up to
+// TEST_SOURCES + 2, the same over every length 0..TEST_LEN in steps of 32,
+// and a test with buffers pushed towards the end of their allocation.
+// Results are verified with pq_check_base, and a non-zero return code from
+// pq_gen itself is treated as a failure too: the all-zero test and the
+// len == 0 cases cannot detect a failing call from buffer contents alone.
+// Returns 0 on pass, 1 on the first failure.
+int main(int argc, char *argv[])
+{
+	int i, j, k, ret, fail = 0;
+	void *buffs[TEST_SOURCES + 2];	// Pointers to src and dest
+	char *tmp_buf[TEST_SOURCES + 2];
+
+	printf("Test pq_gen_test ");
+
+	srand(TEST_SEED);
+
+	// Allocate the arrays
+	for (i = 0; i < TEST_SOURCES + 2; i++) {
+		void *buf;
+		ret = posix_memalign(&buf, 32, TEST_LEN);
+		if (ret) {
+			printf("alloc error: Fail");
+			return 1;
+		}
+		buffs[i] = buf;
+	}
+
+	// Test of all zeros
+	for (i = 0; i < TEST_SOURCES + 2; i++)
+		memset(buffs[i], 0, TEST_LEN);
+
+	ret = pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+	if (ret != 0)
+		fail++;		// P and Q were pre-zeroed, so only ret can reveal this
+
+	for (i = 0; i < TEST_LEN; i++) {
+		if (((char *)buffs[TEST_SOURCES])[i] != 0)
+			fail++;
+	}
+
+	for (i = 0; i < TEST_LEN; i++) {
+		if (((char *)buffs[TEST_SOURCES + 1])[i] != 0)
+			fail++;
+	}
+
+	if (fail > 0) {
+		printf("fail zero test %d\n", fail);
+		return 1;
+	} else
+		putchar('.');
+
+	// Test rand1
+	for (i = 0; i < TEST_SOURCES + 2; i++)
+		rand_buffer(buffs[i], TEST_LEN);
+
+	ret = pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+	fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN, buffs);
+
+	if (fail > 0 || ret != 0) {
+		int t;
+		printf(" Fail rand test1 fail=%d, ret=%d\n", fail, ret);
+		for (t = 0; t < TEST_SOURCES + 2; t++)
+			dump(buffs[t], 15);
+
+		printf(" reference function p,q\n");
+		pq_gen_base(TEST_SOURCES + 2, TEST_LEN, buffs);
+		for (t = TEST_SOURCES; t < TEST_SOURCES + 2; t++)
+			dump(buffs[t], 15);
+
+		return 1;
+	} else
+		putchar('.');
+
+	// Test various number of sources
+	for (j = 4; j <= TEST_SOURCES + 2; j++) {
+		for (i = 0; i < j; i++)
+			rand_buffer(buffs[i], TEST_LEN);
+
+		ret = pq_gen(j, TEST_LEN, buffs);
+		fail |= pq_check_base(j, TEST_LEN, buffs);
+
+		if (fail > 0 || ret != 0) {
+			printf("fail rand test %d sources\n", j);
+			return 1;
+		} else
+			putchar('.');
+	}
+
+	fflush(0);
+
+	// Test various number of sources and len
+	k = 0;
+	while (k <= TEST_LEN) {
+		for (j = 4; j <= TEST_SOURCES + 2; j++) {
+			for (i = 0; i < j; i++)
+				rand_buffer(buffs[i], k);
+
+			ret = pq_gen(j, k, buffs);
+			fail |= pq_check_base(j, k, buffs);
+
+			if (fail > 0 || ret != 0) {
+				printf("fail rand test %d sources, len=%d, fail="
+				       "%d, ret=%d\n", j, k, fail, ret);
+				return 1;
+			}
+		}
+		putchar('.');
+		k += 32;
+	}
+
+	// Test at the end of buffer
+	k = 0;
+	while (k <= TEST_LEN) {
+		for (j = 0; j < (TEST_SOURCES + 2); j++) {
+			rand_buffer(buffs[j], TEST_LEN - k);
+			// Shift the start by k so the data ends at the end of the allocation
+			tmp_buf[j] = (char *)buffs[j] + k;
+		}
+
+		ret = pq_gen(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf);
+		fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf);
+
+		if (fail > 0 || ret != 0) {
+			printf("fail end test - offset: %d, len: %d, fail: %d, "
+			       "ret: %d\n", k, TEST_LEN - k, fail, ret);
+			return 1;
+		}
+
+		putchar('.');
+		fflush(0);
+		k += 32;
+	}
+
+	if (!fail)
+		printf(" done: Pass\n");
+
+	return fail;
+}
--- a/raid/raid_base.c
+++ b/raid/raid_base.c
@ -0,0 +1,147 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <limits.h>
+#include <stdint.h>
+
+// Per-byte constants replicated across a machine word so that pq_gen_base
+// can process several bytes per operation:
+//   notbit0 - clears the bit carried in from the neighbouring byte by a
+//             word-wide left shift
+//   bit7    - selects the top bit of every byte
+//   gf8poly - 0x1d in every byte, xored in where a byte's top bit was set
+#if __WORDSIZE == 64 || _WIN64 || __x86_64__
+# define notbit0 0xfefefefefefefefeULL
+# define bit7    0x8080808080808080ULL
+# define gf8poly 0x1d1d1d1d1d1d1d1dULL
+#else
+# define notbit0 0xfefefefeUL
+# define bit7    0x80808080UL
+# define gf8poly 0x1d1d1d1dUL
+#endif
+
+/*
+ * Reference P+Q parity generation.
+ *
+ * vects - total number of pointers in array: the sources followed by the
+ *         P destination (array[vects-2]) and the Q destination
+ *         (array[vects-1]).
+ * len   - number of bytes in every vector.
+ * array - the vector pointers; they are read/written through unsigned long
+ *         for the bulk of the data, so they must be suitably aligned.
+ *
+ * For every byte position, P is the xor of all sources and Q is built by
+ * walking the sources from the last one down to source 0, doubling the
+ * running value (shift left, xor 0x1d if the top bit was set) before each
+ * next source is xored in.
+ *
+ * Whole words are processed first; any bytes left over when len is not a
+ * multiple of sizeof(long) are then processed one at a time, so the full
+ * len bytes of P and Q are always written.
+ *
+ * Returns 0 on success, 1 if vects < 3 or len < 0 (no source vector or a
+ * negative length would index outside the arrays).
+ */
+int pq_gen_base(int vects, int len, void **array)
+{
+	int i, j;
+	unsigned long p, q, s;
+	unsigned long **src = (unsigned long **)array;
+	unsigned char **bsrc = (unsigned char **)array;
+	int blocks;
+
+	if (vects < 3 || len < 0)
+		return 1;
+
+	blocks = len / sizeof(long);
+
+	for (i = 0; i < blocks; i++) {
+		q = p = src[vects - 3][i];
+
+		for (j = vects - 4; j >= 0; j--) {
+			p ^= s = src[j][i];
+			q = s ^ (((q << 1) & notbit0) ^	// shift each byte
+				 ((((q & bit7) << 1) - ((q & bit7) >> 7))	// mask out bytes
+				  & gf8poly));	// apply poly
+		}
+
+		src[vects - 2][i] = p;	// second to last pointer is p
+		src[vects - 1][i] = q;	// last pointer is q
+	}
+
+	// Remaining bytes that do not fill a whole word
+	for (i = blocks * (int)sizeof(long); i < len; i++) {
+		unsigned char pb, qb, sb;
+
+		qb = pb = bsrc[vects - 3][i];
+
+		for (j = vects - 4; j >= 0; j--) {
+			sb = bsrc[j][i];
+			pb ^= sb;
+			qb = sb ^ (unsigned char)((qb << 1) ^ ((qb & 0x80) ? 0x1d : 0));
+		}
+
+		bsrc[vects - 2][i] = pb;
+		bsrc[vects - 1][i] = qb;
+	}
+	return 0;
+}
+
+// Verify P and Q parity byte by byte.
+// array holds the sources followed by P (array[vects-2]) and Q
+// (array[vects-1]).  For each byte offset the expected P and Q are
+// recomputed from the sources, walking from the last source down to source 0.
+// Returns 0 when every byte matches; otherwise (offset | 1) for the first
+// P mismatch or (offset | 2) for the first Q mismatch, P being checked first.
+int pq_check_base(int vects, int len, void **array)
+{
+	unsigned char **vec = (unsigned char **)array;
+	int off = 0;
+
+	while (off < len) {
+		int v = vects - 3;
+		unsigned char xor_acc = vec[v][off];
+		unsigned char gf_acc = xor_acc;
+
+		while (--v >= 0) {
+			const unsigned char data = vec[v][off];
+			// Double the running Q value in GF(2^8): shift, then
+			// reduce with 0x1d if the top bit was shifted out
+			unsigned char doubled = gf_acc << 1;
+
+			if (gf_acc & 0x80)
+				doubled ^= 0x1d;
+
+			xor_acc ^= data;
+			gf_acc = data ^ doubled;
+		}
+
+		if (vec[vects - 2][off] != xor_acc)	// stored P disagrees
+			return off | 1;
+		if (vec[vects - 1][off] != gf_acc)	// stored Q disagrees
+			return off | 2;
+
+		off++;
+	}
+	return 0;
+}
+
+// Generate xor parity: array[vects-1] receives, for each of len bytes, the
+// xor of the corresponding bytes of array[0] .. array[vects-2].
+// Always returns 0.
+int xor_gen_base(int vects, int len, void **array)
+{
+	unsigned char **vec = (unsigned char **)array;
+	int off, v;
+
+	for (off = 0; off < len; off++) {
+		unsigned char acc = vec[0][off];
+
+		// Fold in the remaining sources, last source first
+		for (v = vects - 2; v >= 1; v--)
+			acc ^= vec[v][off];
+
+		vec[vects - 1][off] = acc;	// final pointer is the destination
+	}
+
+	return 0;
+}
+
+// Check xor parity: for each of len bytes, the xor across all vects vectors
+// (sources and parity together) must be zero.
+// Returns 0 when every byte is consistent, otherwise len.
+int xor_check_base(int vects, int len, void **array)
+{
+	unsigned char **vec = (unsigned char **)array;
+	int off, v;
+
+	for (off = 0; off < len; off++) {
+		unsigned char acc = 0;
+
+		for (v = vects; v-- > 0;)
+			acc ^= vec[v][off];
+
+		// Inside this loop len is necessarily positive, so the
+		// failure value is always len itself
+		if (acc != 0)
+			return len;
+	}
+	return 0;
+}
+
+// Version record attached to each base function: a serial number, a
+// version and a core id.
+struct slver {
+	unsigned short snum;
+	unsigned char ver;
+	unsigned char core;
+};
+
+// One pair of objects per function.  The first of each pair is an
+// uninitialized object whose name ends in eight hex digits matching the
+// core, ver and snum values of the initialized record that follows it.
+// NOTE(review): presumably these exist so the version can be read from the
+// symbol table - confirm against the tooling that consumes them.
+struct slver pq_gen_base_slver_0001012a;
+struct slver pq_gen_base_slver = { 0x012a, 0x01, 0x00 };
+
+struct slver xor_gen_base_slver_0001012b;
+struct slver xor_gen_base_slver = { 0x012b, 0x01, 0x00 };
+
+struct slver pq_check_base_slver_0001012c;
+struct slver pq_check_base_slver = { 0x012c, 0x01, 0x00 };
+
+struct slver xor_check_base_slver_0001012d;
+struct slver xor_check_base_slver = { 0x012d, 0x01, 0x00 };
--- a/raid/raid_multibinary.asm
+++ b/raid/raid_multibinary.asm
@ -0,0 +1,140 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; WRT_OPT is appended to the function symbols taken with lea in the
+;;; dispatch code below: "wrt ..plt" for elf64, nothing for other formats.
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT         wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+default rel
+[bits 64]
+
+;;; Implementations selectable at run time; all are defined in other files
+extern pq_gen_base
+extern pq_gen_sse
+extern pq_gen_avx
+extern pq_gen_avx2
+
+extern xor_gen_base
+extern xor_gen_sse
+extern xor_gen_avx
+
+extern pq_check_base
+extern pq_check_sse
+
+extern xor_check_base
+extern xor_check_sse
+
+;;; xor_gen and pq_gen are dispatched through the mbin_* macros, which are
+;;; not defined in this file (presumably in multibinary.asm - confirm).
+mbin_interface xor_gen
+mbin_interface pq_gen
+
+;;; xor_gen passes xor_gen_avx in both of the last two slots: no xor_gen_avx2
+;;; symbol is declared above, whereas pq_gen has a distinct pq_gen_avx2.
+mbin_dispatch_init5 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_avx, xor_gen_avx
+mbin_dispatch_init5 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_avx, pq_gen_avx2
+
+
+section .data
+
+;;; Jump targets used by xor_check and pq_check.  Each starts out pointing at
+;;; its *_mbinit stub and is overwritten with the selected implementation by
+;;; the matching *_dispatch_init routine.
+xor_check_dispatched:
+	dq      xor_check_mbinit
+pq_check_dispatched:
+	dq      pq_check_mbinit
+
+section .text
+
+;;;;
+; pq_check multibinary function
+;
+; pq_check jumps through pq_check_dispatched.  On first use that pointer
+; still refers to pq_check_mbinit, which calls pq_check_dispatch_init to
+; store the chosen implementation and then falls through into pq_check, so
+; the same call is completed by the selected function.
+;;;;
+global pq_check:function
+pq_check_mbinit:
+	call	pq_check_dispatch_init
+pq_check:
+	jmp     qword [pq_check_dispatched]
+
+; Select pq_check_sse when the SSE4_1 feature bit is reported by cpuid leaf 1,
+; otherwise pq_check_base.  Every register written here is pushed first and
+; popped afterwards, so the caller's registers are unchanged on return.
+pq_check_dispatch_init:
+	push    rax
+	push    rbx
+	push    rcx
+	push    rdx
+	push    rsi
+	lea     rsi, [pq_check_base WRT_OPT] ; Default
+
+	mov     eax, 1
+	cpuid
+	test    ecx, FLAG_CPUID1_ECX_SSE4_1
+	lea     rbx, [pq_check_sse WRT_OPT]	; lea leaves the flags from test intact
+	cmovne	rsi, rbx		; feature bit set -> use the SSE version
+
+	mov     [pq_check_dispatched], rsi
+	pop     rsi
+	pop     rdx
+	pop     rcx
+	pop     rbx
+	pop     rax
+	ret
+
+
+;;;;
+; xor_check multibinary function
+;
+; xor_check jumps through xor_check_dispatched.  On first use that pointer
+; still refers to xor_check_mbinit, which calls xor_check_dispatch_init to
+; store the chosen implementation and then falls through into xor_check, so
+; the same call is completed by the selected function.
+;;;;
+global xor_check:function
+xor_check_mbinit:
+	call    xor_check_dispatch_init
+xor_check:
+	jmp     qword [xor_check_dispatched]
+
+; Select xor_check_sse when the SSE4_1 feature bit is reported by cpuid leaf
+; 1, otherwise xor_check_base.  Every register written here is pushed first
+; and popped afterwards, so the caller's registers are unchanged on return.
+xor_check_dispatch_init:
+	push    rax
+	push    rbx
+	push    rcx
+	push    rdx
+	push    rsi
+	lea     rsi, [xor_check_base WRT_OPT] ; Default
+
+	mov     eax, 1
+	cpuid
+	test    ecx, FLAG_CPUID1_ECX_SSE4_1
+	lea     rbx, [xor_check_sse WRT_OPT]	; lea leaves the flags from test intact
+	cmovne  rsi, rbx		; feature bit set -> use the SSE version
+
+	mov     [xor_check_dispatched], rsi
+	pop     rsi
+	pop     rdx
+	pop     rcx
+	pop     rbx
+	pop     rax
+	ret
+
+;;; Version records for the four dispatched entry points.  The slversion
+;;; macro is not defined in this file (presumably in reg_sizes.asm - confirm).
+;;;       func          	core, ver, snum
+slversion xor_gen,		00,   03,  0126
+slversion xor_check,		00,   03,  0127
+slversion pq_gen,		00,   03,  0128
+slversion pq_check,		00,   03,  0129
--- a/raid/xor_check_sse.asm
+++ b/raid/xor_check_sse.asm
@ -0,0 +1,285 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor parity check of N vectors using SSE
+;;; int xor_check_sse(int vects, int len, void **array)
+
+;;; Checks that the xor of all N (vects) vectors in the array of pointers
+;;; (**array) is zero.  Returns 0 if it is, 1 if not or if vects < 2.
+;;; Vectors must be aligned to 16 bytes.  Length can be any value.
+
+%include "reg_sizes.asm"
+;;; Calling-convention glue: argument/scratch register names and the FUNC_SAVE/FUNC_RESTORE prologue/epilogue for each output format
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2  rax
+ %define tmp2.b al
+ %define tmp3  arg4
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+ ; elf64: FUNC_SAVE and FUNC_RESTORE are defined empty, so no prologue/epilogue code is emitted
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define return rax
+ %define tmp2  rax
+ %define tmp2.b al
+ %define PS 8
+ %define tmp   r11
+ %define tmp3  r10
+ %define stack_size  2*16 + 8 	; two 16-byte xmm slots + 8; must be an odd multiple of 8 (presumably so rsp is 16-byte aligned for movdqa -- confirm)
+ %define func(x) proc_frame x
+ ; win64: xmm6 and xmm7 are spilled to the stack on entry and reloaded before each return
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	add	rsp, stack_size
+ %endmacro
+
+ ; elf32: arguments are read from the stack via arg(x); FUNC_SAVE copies arg(1) and arg(2) into ecx and edi
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0   arg(0)
+ %define arg1   ecx
+ %define tmp2   eax
+ %define tmp2.b  al
+ %define tmp3   edx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2  edi	; must save/restore
+ %define arg3  esi
+ %define tmp   ebx
+ ; esi, edi and ebx are pushed here and popped in reverse order by FUNC_RESTORE
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+	mov	arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	mov	esp, ebp	;if has frame pointer
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
+;;; Symbolic names used by the routine below, mapped onto the per-format registers defined above
+%define vec arg0
+%define	len arg1
+%define ptr arg3
+%define pos tmp3
+
+%ifidn PS,8			; 64-bit code
+ default rel
+ [bits 64]
+%endif
+;;; XLDR loads the first vector of each 128-byte pass; XSTR is defined but not referenced by xor_check_sse
+;;; Use non-temporal load/store unless NO_NT_LDST is defined
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+;;; xor_check_sse(vects, len, array): returns 0 when the xor of all vects vectors is zero over len bytes, 1 otherwise (also 1 when vects < 2)
+align 16
+global xor_check_sse:function
+func(xor_check_sse)
+	FUNC_SAVE
+%ifidn PS,8				;64-bit code
+	sub	vec, 1			; vec = index of the last vector in array
+%else					;32-bit code
+	mov	tmp, arg(0)		; Same update, done on the stack copy of vects
+	sub	tmp, 1
+	mov	arg(0), tmp		; mov leaves the flags from sub intact for the jng below
+%endif
+
+	jng	return_fail		;Fail when vects - 1 <= 0
+	cmp	len, 0
+	je	return_pass
+	test	len, (128-1)		;Is len a multiple of 128 bytes?
+	jnz	len_not_aligned
+
+	;; Main path: len is a non-zero multiple of 128; check 128 bytes of every vector per pass
+len_aligned_128bytes:
+	sub	len, 128		;len = start offset of the final 128-byte chunk
+	mov	pos, 0
+	mov	tmp, vec		;Preset to last vector
+
+loop128:
+	mov	tmp2, [arg2+tmp*PS]	;Fetch last pointer in array
+	sub	tmp, 1			;Next vect
+	XLDR	xmm0, [tmp2+pos]	;Load 128 bytes at pos from the last vector
+	XLDR	xmm1, [tmp2+pos+16]	;Keep xor parity in xmm0-7
+	XLDR	xmm2, [tmp2+pos+(2*16)]
+	XLDR	xmm3, [tmp2+pos+(3*16)]
+	XLDR	xmm4, [tmp2+pos+(4*16)]
+	XLDR	xmm5, [tmp2+pos+(5*16)]
+	XLDR	xmm6, [tmp2+pos+(6*16)]
+	XLDR	xmm7, [tmp2+pos+(7*16)]
+
+next_vect:
+	mov 	ptr, [arg2+tmp*PS]
+	sub	tmp, 1			;Sets the flags tested by the jge below
+	xorpd	xmm0, [ptr+pos]		;Get next vector (source)
+	xorpd	xmm1, [ptr+pos+16]
+	xorpd	xmm2, [ptr+pos+(2*16)]
+	xorpd	xmm3, [ptr+pos+(3*16)]
+	xorpd	xmm4, [ptr+pos+(4*16)]
+	xorpd	xmm5, [ptr+pos+(5*16)]
+	xorpd	xmm6, [ptr+pos+(6*16)]
+	xorpd	xmm7, [ptr+pos+(7*16)]
+;;;  	prefetch [ptr+pos+(8*16)]
+	jge	next_vect		;Loop until index 0 has been folded in
+
+	;; End of vects, check that all parity regs = 0
+	mov	tmp, vec		;Back to last vector
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+	por	xmm0, xmm4
+	por	xmm0, xmm5
+	por	xmm0, xmm6
+	por	xmm0, xmm7
+	ptest	xmm0, xmm0		;ZF is set only when the combined parity is all zero
+	jnz	return_fail
+
+	add	pos, 128
+	cmp	pos, len
+	jle	loop128			;Continue while pos <= start of the final chunk
+
+return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+
+
+;;; Do one byte at a time for no alignment case, working backwards from the end of the vectors
+
+xor_gen_byte:
+	mov	tmp, vec		;Preset to last vector
+
+loop_1byte:
+	mov 	ptr, [arg2+tmp*PS] 	;Fetch last pointer in array
+	mov	tmp2.b, [ptr+len-1]	;Get array n
+	sub	tmp, 1
+nextvect_1byte:
+	mov 	ptr, [arg2+tmp*PS]
+	xor	tmp2.b, [ptr+len-1]
+	sub	tmp, 1
+	jge	nextvect_1byte
+
+	mov	tmp, vec		;Back to last vector
+	cmp	tmp2.b, 0		;A non-zero parity byte means a mismatch
+	jne	return_fail
+	sub	len, 1
+	test	len, (8-1)
+	jnz	loop_1byte		;Repeat until len is a multiple of 8
+
+	cmp	len, 0
+	je	return_pass
+	test	len, (128-1)		;If not 0 and a multiple of 128 bytes
+	jz	len_aligned_128bytes	; then do aligned case. len = y * 128
+
+	;; else we are 8-byte aligned so fall through to recheck
+
+
+	;; Unaligned length cases
+len_not_aligned:
+	test	len, (PS-1)
+	jne	xor_gen_byte		;Not a multiple of the pointer size: trim bytes first
+	mov	tmp3, len
+	and	tmp3, (128-1)		;Do the unaligned bytes 4-8 at a time
+	mov	tmp, vec		;Preset to last vector
+
+	;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
+loopN_bytes:
+	mov 	ptr, [arg2+tmp*PS] 	;Fetch last pointer in array
+	mov	tmp2, [ptr+len-PS]	;Get array n
+	sub	tmp, 1
+nextvect_Nbytes:
+	mov 	ptr, [arg2+tmp*PS] 	;Get pointer to next vector
+	xor	tmp2, [ptr+len-PS]
+	sub	tmp, 1
+	jge	nextvect_Nbytes		;Loop for each source
+
+	mov	tmp, vec		;Back to last vector
+	cmp	tmp2, 0			;A non-zero parity word means a mismatch
+	jne	return_fail
+	sub	len, PS
+	sub	tmp3, PS
+	jg	loopN_bytes
+
+	cmp	len, 128		;Now len is aligned to 128B
+	jge	len_aligned_128bytes	;We can do the rest aligned
+
+	cmp	len, 0
+	je	return_pass
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;;       func           core, ver, snum
+slversion xor_check_sse, 00,   03,  0031
+
--- a/raid/xor_check_test.c
+++ b/raid/xor_check_test.c
@ -0,0 +1,280 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16	// main() allocates TEST_SOURCES + 1 buffers in total
+#define TEST_LEN     1024	// bytes per buffer
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))	// total bytes across all buffers; not referenced elsewhere in this test
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234	// default srand() seed, used unless TEST_SEED is already defined
+#endif
+
+// Fill the first buffer_size bytes of buf with values drawn from rand(),
+// one call per byte.  Nothing is written when buffer_size is not positive.
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	unsigned char *out = buf;
+	long remaining = buffer_size;
+
+	while (remaining-- > 0)
+		*out++ = rand();
+}
+// Test driver for xor_check(): builds parity with xor_gen_base(), then verifies intact buffers pass and corrupted ones fail.  Returns 0 when every phase passes, non-zero otherwise.
+int main(int argc, char *argv[])
+{
+	int i, j, k, ret, fail = 0;
+	void *buffs[TEST_SOURCES + 1];
+	char c;
+	int serr, lerr;
+	char *tmp_buf[TEST_SOURCES + 1];
+
+	printf("Test xor_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+	srand(TEST_SEED);
+
+	// Allocate TEST_SOURCES + 1 buffers of TEST_LEN bytes, each 16-byte aligned
+	for (i = 0; i < TEST_SOURCES + 1; i++) {
+		void *buf;
+		if (posix_memalign(&buf, 16, TEST_LEN)) {
+			printf("alloc error: Fail");
+			return 1;
+		}
+		buffs[i] = buf;
+	}
+
+	// Test of all zeros
+	for (i = 0; i < TEST_SOURCES + 1; i++)
+		memset(buffs[i], 0, TEST_LEN);
+
+	xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+	ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+	if (ret != 0) {
+		fail++;
+		printf("\nfail zero test %d\n", ret);
+	}
+	// A single non-zero byte must now make the check fail
+	((char *)(buffs[0]))[TEST_LEN - 2] = 0x7;	// corrupt buffer
+	ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+	if (ret == 0) {
+		fail++;
+		printf("\nfail corrupt buffer test %d\n", ret);
+	}
+	((char *)(buffs[0]))[TEST_LEN - 2] = 0;	// un-corrupt buffer
+
+	// Test corrupted buffer any location on all sources
+	for (j = 0; j < TEST_SOURCES + 1; j++) {
+		for (i = TEST_LEN - 1; i >= 0; i--) {
+			((char *)buffs[j])[i] = 0x5;	// corrupt buffer
+			ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+			if (ret == 0) {
+				fail++;
+				printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+				return 1;
+			}
+			((char *)buffs[j])[i] = 0;	// un-corrupt buffer
+		}
+		putchar('.');
+	}
+
+	// Test rand1: fill every buffer with random data, then call xor_gen_base() before checking
+	for (i = 0; i < TEST_SOURCES + 1; i++)
+		rand_buffer(buffs[i], TEST_LEN);
+
+	xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+	ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+	if (ret != 0) {
+		fail++;
+		printf("fail first rand test %d\n", ret);
+	}
+	// Flip one bit near the end of buffer 0 and expect the check to fail
+	c = ((char *)(buffs[0]))[TEST_LEN - 2];
+	((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1;
+	ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+	if (ret == 0) {
+		fail++;
+		printf("\nFail corrupt buffer test, passed when should have failed\n");
+	}
+	((char *)(buffs[0]))[TEST_LEN - 2] = c;	// un-corrupt buffer
+
+	// Test corrupted buffer any location on all sources w/ random data
+	for (j = 0; j < TEST_SOURCES + 1; j++) {
+		for (i = TEST_LEN - 1; i >= 0; i--) {
+			// Check it still passes
+			ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+			if (ret != 0) {	// should pass
+				fail++;
+				printf
+				    ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n",
+				     j, i);
+				return 1;
+			}
+			c = ((char *)buffs[j])[i];
+			((char *)buffs[j])[i] = c ^ 1;	// corrupt buffer
+			ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+			if (ret == 0) {	// Check it now fails
+				fail++;
+				printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+				return 1;
+			}
+			((char *)buffs[j])[i] = c;	// un-corrupt buffer
+		}
+		putchar('.');
+	}
+
+	// Test various number of sources, full length (j counts every buffer passed to xor_check)
+	for (j = 3; j <= TEST_SOURCES + 1; j++) {
+		// New random data
+		for (i = 0; i < j; i++)
+			rand_buffer(buffs[i], TEST_LEN);
+
+		// Generate xor parity for this number of sources
+		xor_gen_base(j, TEST_LEN, buffs);
+
+		// Set errors up in each source and len position
+		for (i = 0; i < j; i++) {
+			for (k = 0; k < TEST_LEN; k++) {
+				// See if it still passes
+				ret = xor_check(j, TEST_LEN, buffs);
+				if (ret != 0) {	// Should pass
+					printf("\nfail rand test %d sources\n", j);
+					fail++;
+					return 1;
+				}
+
+				c = ((char *)buffs[i])[k];
+				((char *)buffs[i])[k] = c ^ 1;	// corrupt buffer
+
+				ret = xor_check(j, TEST_LEN, buffs);
+				if (ret == 0) {	// Should fail
+					printf
+					    ("\nfail rand test corrupted buffer %d sources\n",
+					     j);
+					fail++;
+					return 1;
+				}
+				((char *)buffs[i])[k] = c;	// un-corrupt buffer
+			}
+		}
+		putchar('.');
+	}
+
+	fflush(0);
+
+	// Test various number of sources and len: k takes every length from 1 to TEST_LEN
+	k = 1;
+	while (k <= TEST_LEN) {
+		for (j = 3; j <= TEST_SOURCES + 1; j++) {
+			for (i = 0; i < j; i++)
+				rand_buffer(buffs[i], k);
+
+			// Generate xor parity for this number of sources
+			xor_gen_base(j, k, buffs);
+
+			// Inject errors at various source and len positions (every 10th byte offset)
+			for (lerr = 0; lerr < k; lerr += 10) {
+				for (serr = 0; serr < j; serr++) {
+
+					// See if it still passes
+					ret = xor_check(j, k, buffs);
+					if (ret != 0) {	// Should pass
+						printf("\nfail rand test %d sources\n", j);
+						fail++;
+						return 1;
+					}
+
+					c = ((char *)buffs[serr])[lerr];
+					((char *)buffs[serr])[lerr] = c ^ 1;	// corrupt buffer
+
+					ret = xor_check(j, k, buffs);
+					if (ret == 0) {	// Should fail
+						printf("\nfail rand test corrupted buffer "
+						       "%d sources, len=%d, ret=%d\n", j, k,
+						       ret);
+						fail++;
+						return 1;
+					}
+					((char *)buffs[serr])[lerr] = c;	// un-corrupt buffer
+				}
+			}
+		}
+		putchar('.');
+		fflush(0);
+		k += 1;
+	}
+
+	// Test at the end of buffer: check the last TEST_LEN - i bytes of each buffer, i in steps of 32
+	for (i = 0; i < TEST_LEN; i += 32) {
+		for (j = 0; j < TEST_SOURCES + 1; j++) {
+			rand_buffer(buffs[j], TEST_LEN - i);
+			tmp_buf[j] = (char *)buffs[j] + i;
+		}
+		// NOTE(review): rand_buffer() above fills from buffs[j], not from the offset tmp_buf[j] window -- confirm intended
+		xor_gen_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+
+		// Test good data
+		ret = xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+		if (ret != 0) {
+			printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+			fail++;
+			return 1;
+		}
+		// Test bad data
+		for (serr = 0; serr < TEST_SOURCES + 1; serr++) {
+			for (lerr = 0; lerr < (TEST_LEN - i); lerr++) {
+				c = tmp_buf[serr][lerr];
+				tmp_buf[serr][lerr] = c ^ 1;
+
+				ret =
+				    xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+				if (ret == 0) {
+					printf("fail end test corrupted buffer - "
+					       "offset: %d, len: %d, ret: %d\n", i,
+					       TEST_LEN - i, ret);
+					fail++;
+					return 1;
+				}
+
+				tmp_buf[serr][lerr] = c;
+			}
+		}
+
+		putchar('.');
+		fflush(0);
+	}
+
+	if (fail == 0)
+		printf("Pass\n");
+	// fail counts the phases that reported an error without returning early
+	return fail;
+
+}
--- a/raid/xor_example.c
+++ b/raid/xor_example.c
@ -0,0 +1,70 @@
+/**********************************************************************
+  Copyright(c) 2011-2013 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+// Parenthesized so the macro stays a single value inside larger expressions
+// such as TEST_LEN / 2.
+#define TEST_LEN     (16*1024)
+
+/*
+ * Minimal xor parity example: fills TEST_SOURCES + 1 aligned buffers with
+ * random data, calls xor_gen_sse() on them, then confirms that
+ * xor_check_sse() accepts the intact set and rejects it after one bit flip.
+ *
+ * Returns 0 only when both checks behave as expected; returns 1 on an
+ * allocation error or when either check gives the wrong answer, so scripts
+ * can detect a failure from the exit status as well as from the output.
+ */
+int main(int argc, char *argv[])
+{
+	int i, j, should_pass, should_fail;
+	void *buffs[TEST_SOURCES + 1];
+
+	printf("XOR example\n");
+	for (i = 0; i < TEST_SOURCES + 1; i++) {
+		void *buf;
+		if (posix_memalign(&buf, 16, TEST_LEN)) {
+			printf("alloc error: Fail");
+			while (i-- > 0)	// release what was already allocated
+				free(buffs[i]);
+			return 1;
+		}
+		buffs[i] = buf;
+	}
+
+	printf("Make random data\n");
+	for (i = 0; i < TEST_SOURCES + 1; i++)
+		for (j = 0; j < TEST_LEN; j++)
+			((char *)buffs[i])[j] = rand();
+
+	printf("Generate xor parity\n");
+	xor_gen_sse(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+	printf("Check parity: ");
+	should_pass = xor_check_sse(TEST_SOURCES + 1, TEST_LEN, buffs);
+	printf("%s\n", should_pass == 0 ? "Pass" : "Fail");
+
+	printf("Find corruption: ");
+	((char *)buffs[TEST_SOURCES / 2])[TEST_LEN / 2] ^= 1;	// flip one bit
+	should_fail = xor_check_sse(TEST_SOURCES + 1, TEST_LEN, buffs);	//recheck
+	printf("%s\n", should_fail != 0 ? "Pass" : "Fail");
+
+	for (i = 0; i < TEST_SOURCES + 1; i++)
+		free(buffs[i]);
+
+	// Both steps must have printed "Pass" for the example to succeed
+	return (should_pass == 0 && should_fail != 0) ? 0 : 1;
+}
--- a/raid/xor_gen_avx.asm
+++ b/raid/xor_gen_avx.asm
@ -0,0 +1,228 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using AVX
+;;; int xor_gen_avx(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array).  Last pointer is the dest.
+;;; Vectors must be aligned to 32 bytes.  Length can be any value.
+
+%include "reg_sizes.asm"
+;;; Calling-convention glue: argument/scratch register names and the FUNC_SAVE/FUNC_RESTORE prologue/epilogue for each output format
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp3  arg4
+ %define func(x) x:
+ %define return rax
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+ ; elf64: FUNC_SAVE and FUNC_RESTORE are defined empty, so no prologue/epilogue code is emitted
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmp3  r10
+ %define func(x) proc_frame x
+ %define return rax
+ %define stack_size  2*32 + 8 	;two 32-byte ymm slots + 8; must be an odd multiple of 8
+ ; win64: the full ymm6 and ymm7 are spilled with unaligned moves on entry and reloaded before each return
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	vmovdqu	[rsp + 0*32], ymm6
+	vmovdqu	[rsp + 1*32], ymm7
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+	vmovdqu	ymm6, [rsp + 0*32]
+	vmovdqu	ymm7, [rsp + 1*32]
+	add	rsp, stack_size
+ %endmacro
+
+%endif	;output formats
+
+;;; Symbolic names used by the routine below; PS is the pointer size in bytes
+%define vec arg0
+%define	len arg1
+%define ptr arg3
+%define tmp2 rax
+%define tmp2.b al
+%define pos tmp3
+%define PS 8
+;;; Loads are vmovdqa in both configurations; only the store changes (NOTE(review): presumably because a 256-bit non-temporal load is not available in AVX -- confirm)
+;;; Use non-temporal store unless NO_NT_LDST is defined
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovdqa
+ %define XSTR vmovntdq
+%endif
+
+
+default rel
+[bits 64]
+
+section .text
+;;; xor_gen_avx(vects, len, array): xors array[0..vects-2] into array[vects-1]; returns 0 on success, 1 when vects <= 2
+align 16
+global xor_gen_avx:function
+func(xor_gen_avx)
+
+	FUNC_SAVE
+	sub	vec, 2			;vec = index of last source; parity vector is at vec+1
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (128-1)		;Is len a multiple of 128 bytes?
+	jnz	len_not_aligned
+
+	;; Main path: len is a non-zero multiple of 128; produce 128 bytes of parity per pass
+len_aligned_128bytes:
+	sub	len, 128		;len = start offset of the final 128-byte chunk
+	mov	pos, 0
+
+loop128:
+	mov	tmp, vec		;Back to last vector
+	mov	tmp2, [arg2+vec*PS]	;Fetch last pointer in array
+	sub	tmp, 1			;Next vect
+	XLDR	ymm0, [tmp2+pos]	;Load 128 bytes at pos from the last source
+	XLDR	ymm1, [tmp2+pos+32]	;Keep xor parity in ymm0-3
+	XLDR	ymm2, [tmp2+pos+(2*32)]
+	XLDR	ymm3, [tmp2+pos+(3*32)]
+
+next_vect:
+	mov 	ptr, [arg2+tmp*PS]
+	sub	tmp, 1			;Sets the flags tested by the jge below
+	XLDR	ymm4, [ptr+pos]		;Get next vector (source)
+	XLDR	ymm5, [ptr+pos+32]
+	XLDR	ymm6, [ptr+pos+(2*32)]
+	XLDR	ymm7, [ptr+pos+(3*32)]
+	vxorpd	ymm0, ymm0, ymm4	;Add to xor parity
+	vxorpd	ymm1, ymm1, ymm5
+	vxorpd	ymm2, ymm2, ymm6
+	vxorpd	ymm3, ymm3, ymm7
+	jge	next_vect		;Loop for each source
+
+	mov	ptr, [arg2+PS+vec*PS]	;Address of parity vector
+	XSTR	[ptr+pos], ymm0		;Write parity xor vector
+	XSTR	[ptr+pos+(1*32)], ymm1
+	XSTR	[ptr+pos+(2*32)], ymm2
+	XSTR	[ptr+pos+(3*32)], ymm3
+	add	pos, 128
+	cmp	pos, len
+	jle	loop128			;Continue while pos <= start of the final chunk
+
+return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+
+;;; Do one byte at a time for no alignment case, working backwards from the end of the vectors
+loop_1byte:
+	mov	tmp, vec		;Back to last vector
+	mov 	ptr, [arg2+vec*PS] 	;Fetch last pointer in array
+	mov	tmp2.b, [ptr+len-1]	;Get array n
+	sub	tmp, 1
+nextvect_1byte:
+	mov 	ptr, [arg2+tmp*PS]
+	xor	tmp2.b, [ptr+len-1]
+	sub	tmp, 1
+	jge	nextvect_1byte
+
+	mov	tmp, vec
+	add	tmp, 1		  	;Add back to point to last vec
+	mov	ptr, [arg2+tmp*PS]
+	mov	[ptr+len-1], tmp2.b 	;Write parity
+	sub	len, 1
+	test	len, (PS-1)
+	jnz	loop_1byte		;Repeat until len is a multiple of PS
+
+	cmp	len, 0
+	je	return_pass
+	test	len, (128-1)		;If not 0 and a multiple of 128 bytes
+	jz	len_aligned_128bytes	; then do aligned case. len = y * 128
+
+	;; else we are 8-byte aligned so fall through to recheck
+
+
+	;; Unaligned length cases
+len_not_aligned:
+	test	len, (PS-1)
+	jne	loop_1byte		;Not a multiple of 8: trim bytes first
+	mov	tmp3, len
+	and	tmp3, (128-1)		;Do the unaligned bytes 8 at a time
+
+	;; Run backwards 8 bytes at a time for (tmp3) bytes
+loop8_bytes:
+	mov	tmp, vec		;Back to last vector
+	mov 	ptr, [arg2+vec*PS] 	;Fetch last pointer in array
+	mov	tmp2, [ptr+len-PS]	;Get array n
+	sub	tmp, 1
+nextvect_8bytes:
+	mov 	ptr, [arg2+tmp*PS] 	;Get pointer to next vector
+	xor	tmp2, [ptr+len-PS]
+	sub	tmp, 1
+	jge	nextvect_8bytes		;Loop for each source
+
+	mov	tmp, vec
+	add	tmp, 1		  	;Add back to point to last vec
+	mov	ptr, [arg2+tmp*PS]
+	mov	[ptr+len-PS], tmp2	;Write parity
+	sub	len, PS
+	sub	tmp3, PS
+	jg	loop8_bytes
+
+	cmp	len, 128		;Now len is aligned to 128B
+	jge	len_aligned_128bytes	;We can do the rest aligned
+
+	cmp	len, 0
+	je	return_pass
+
+return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+
+;;;       func         core, ver, snum
+slversion xor_gen_avx, 02,   05,  0037
+
--- a/raid/xor_gen_perf.c
+++ b/raid/xor_gen_perf.c
@ -0,0 +1,98 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include "raid.h"
+#include "test.h"
+
+// Benchmark sizing.  Expression macros are parenthesized so they expand to a
+// single value wherever they are used; the resulting values are unchanged.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test.  Loop many times over the same small buffers.
+# define TEST_SOURCES 10
+# define TEST_LEN     (8*1024)
+# define TEST_LOOPS   400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test.  Pull from large mem base.
+# define TEST_SOURCES 10
+# define GT_L3_CACHE  (32*1024*1024)	/* some number > last level cache */
+# define TEST_LEN     (GT_L3_CACHE / TEST_SOURCES)
+# define TEST_LOOPS   1000
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+
+/*
+ * Times TEST_LOOPS calls of xor_gen() over TEST_SOURCES + 1 zero-filled,
+ * 32-byte aligned buffers of TEST_LEN bytes and prints the result through
+ * perf_print().  Returns 1 on an allocation error, otherwise the value of
+ * fail, which is never changed from 0.
+ */
+int main(int argc, char *argv[])
+{
+	int idx, loops, fail = 0;
+	void *table;
+	void **buffs;
+	struct perf start, stop;
+
+	printf("Test xor_gen_perf\n");
+
+	// Pointer table handed to xor_gen()
+	if (posix_memalign(&table, 8, sizeof(int *) * (TEST_SOURCES + 6)) != 0) {
+		printf("alloc error: Fail");
+		return 1;
+	}
+	buffs = table;
+
+	// One aligned buffer per table slot in use
+	idx = 0;
+	while (idx < TEST_SOURCES + 1) {
+		void *buf;
+
+		if (posix_memalign(&buf, 32, TEST_LEN) != 0) {
+			printf("alloc error: Fail");
+			return 1;
+		}
+		buffs[idx++] = buf;
+	}
+
+	// Start every buffer from all zeros
+	for (idx = TEST_SOURCES; idx >= 0; idx--)
+		memset(buffs[idx], 0, TEST_LEN);
+
+	// Untimed first call
+	xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+	perf_start(&start);
+	loops = 0;
+	while (loops < TEST_LOOPS) {
+		xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+		loops++;
+	}
+	perf_stop(&stop);
+
+	printf("xor_gen" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_MEM * loops);
+
+	return fail;
+}
--- a/raid/xor_gen_sse.asm
+++ b/raid/xor_gen_sse.asm
@ -0,0 +1,284 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using SSE
+;;; int xor_gen_sse(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array).  Last pointer is the dest.
+;;; Vectors must be aligned to 16 bytes.  Length can be any value.
+
+%include "reg_sizes.asm"
+;;; Calling-convention glue: argument/scratch register names and the FUNC_SAVE/FUNC_RESTORE prologue/epilogue for each output format
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2  rax
+ %define tmp2.b al
+ %define tmp3  arg4
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+ ; elf64: FUNC_SAVE and FUNC_RESTORE are defined empty, so no prologue/epilogue code is emitted
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define return rax
+ %define tmp2  rax
+ %define tmp2.b al
+ %define PS 8
+ %define tmp   r11
+ %define tmp3  r10
+ %define stack_size  2*16 + 8 	; two 16-byte xmm slots + 8; must be an odd multiple of 8 (presumably so rsp is 16-byte aligned for movdqa -- confirm)
+ %define func(x) proc_frame x
+ ; win64: xmm6 and xmm7 are spilled to the stack on entry and reloaded before each return
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	add	rsp, stack_size
+ %endmacro
+
+ ; elf32: arguments are read from the stack via arg(x); FUNC_SAVE copies arg(1) and arg(2) into ecx and edi
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0   arg(0)
+ %define arg1   ecx
+ %define tmp2   eax
+ %define tmp2.b  al
+ %define tmp3   edx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2  edi	; must save/restore
+ %define arg3  esi
+ %define tmp   ebx
+ ; esi, edi and ebx are pushed here and popped in reverse order by FUNC_RESTORE
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+	mov	arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	mov	esp, ebp	;if has frame pointer
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
+;;; Symbolic names used by the routine below, mapped onto the per-format registers defined above
+%define vec arg0
+%define	len arg1
+%define ptr arg3
+%define pos tmp3
+
+%ifidn PS,8			; 64-bit code
+ default rel
+ [bits 64]
+%endif
+;;; XLDR loads the first source of each 128-byte pass and XSTR writes the parity vector
+;;; Use non-temporal load/store unless NO_NT_LDST is defined
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+;;; xor_gen_sse(vects, len, array): xors array[0..vects-2] into array[vects-1]; returns 0 on success, 1 when vects <= 2
+align 16
+global xor_gen_sse:function
+func(xor_gen_sse)
+	FUNC_SAVE
+%ifidn PS,8				;64-bit code
+	sub	vec, 2			; vec = index of last source; parity vector is at vec+1
+%else					;32-bit code
+	mov	tmp, arg(0)		; Same update, done on the stack copy of vects
+	sub	tmp, 2
+	mov	arg(0), tmp		; mov leaves the flags from sub intact for the jng below
+%endif
+
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (128-1)		;Is len a multiple of 128 bytes?
+	jnz	len_not_aligned
+
+	;; Main path: len is a non-zero multiple of 128; produce 128 bytes of parity per pass
+len_aligned_128bytes:
+	sub	len, 128		;len = start offset of the final 128-byte chunk
+	mov	pos, 0
+	mov	tmp, vec		;Preset to last vector
+
+loop128:
+	mov	tmp2, [arg2+tmp*PS]	;Fetch last pointer in array
+	sub	tmp, 1			;Next vect
+	XLDR	xmm0, [tmp2+pos]	;Load 128 bytes at pos from the last source
+	XLDR	xmm1, [tmp2+pos+16]	;Keep xor parity in xmm0-7
+	XLDR	xmm2, [tmp2+pos+(2*16)]
+	XLDR	xmm3, [tmp2+pos+(3*16)]
+	XLDR	xmm4, [tmp2+pos+(4*16)]
+	XLDR	xmm5, [tmp2+pos+(5*16)]
+	XLDR	xmm6, [tmp2+pos+(6*16)]
+	XLDR	xmm7, [tmp2+pos+(7*16)]
+
+next_vect:
+	mov 	ptr, [arg2+tmp*PS]
+	sub	tmp, 1			;Sets the flags tested by the jge below
+	xorpd	xmm0, [ptr+pos]		;Get next vector (source)
+	xorpd	xmm1, [ptr+pos+16]
+	xorpd	xmm2, [ptr+pos+(2*16)]
+	xorpd	xmm3, [ptr+pos+(3*16)]
+	xorpd	xmm4, [ptr+pos+(4*16)]
+	xorpd	xmm5, [ptr+pos+(5*16)]
+	xorpd	xmm6, [ptr+pos+(6*16)]
+	xorpd	xmm7, [ptr+pos+(7*16)]
+;;;  	prefetch [ptr+pos+(8*16)]
+	jge	next_vect		;Loop for each vect
+
+	;; All sources folded in: store the 128 parity bytes to the vector after the last source
+	mov	tmp, vec		;Back to last vector
+	mov	ptr, [arg2+PS+tmp*PS]	;Address of parity vector
+	XSTR	[ptr+pos], xmm0		;Write parity xor vector
+	XSTR	[ptr+pos+(1*16)], xmm1
+	XSTR	[ptr+pos+(2*16)], xmm2
+	XSTR	[ptr+pos+(3*16)], xmm3
+	XSTR	[ptr+pos+(4*16)], xmm4
+	XSTR	[ptr+pos+(5*16)], xmm5
+	XSTR	[ptr+pos+(6*16)], xmm6
+	XSTR	[ptr+pos+(7*16)], xmm7
+	add	pos, 128
+	cmp	pos, len
+	jle	loop128			;Continue while pos <= start of the final chunk
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+
+
+;;; Do one byte at a time for no alignment case, working backwards from the end of the vectors
+
+xor_gen_byte:
+	mov	tmp, vec		;Preset to last vector
+
+loop_1byte:
+	mov 	ptr, [arg2+tmp*PS] 	;Fetch last pointer in array
+	mov	tmp2.b, [ptr+len-1]	;Get array n
+	sub	tmp, 1
+nextvect_1byte:
+	mov 	ptr, [arg2+tmp*PS]
+	xor	tmp2.b, [ptr+len-1]
+	sub	tmp, 1
+	jge	nextvect_1byte
+
+	mov	tmp, vec		;Back to last vector
+	mov	ptr, [arg2+PS+tmp*PS]	;Get last vec
+	mov	[ptr+len-1], tmp2.b 	;Write parity
+	sub	len, 1
+	test	len, (8-1)
+	jnz	loop_1byte		;Repeat until len is a multiple of 8
+
+	cmp	len, 0
+	je	return_pass
+	test	len, (128-1)		;If not 0 and a multiple of 128 bytes
+	jz	len_aligned_128bytes	; then do aligned case. len = y * 128
+
+	;; else we are 8-byte aligned so fall through to recheck
+
+
+	;; Unaligned length cases
+len_not_aligned:
+	test	len, (PS-1)
+	jne	xor_gen_byte		;Not a multiple of the pointer size: trim bytes first
+	mov	tmp3, len
+	and	tmp3, (128-1)		;Do the unaligned bytes 4-8 at a time
+	mov	tmp, vec		;Preset to last vector
+
+	;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
+loopN_bytes:
+	mov 	ptr, [arg2+tmp*PS] 	;Fetch last pointer in array
+	mov	tmp2, [ptr+len-PS]	;Get array n
+	sub	tmp, 1
+nextvect_Nbytes:
+	mov 	ptr, [arg2+tmp*PS] 	;Get pointer to next vector
+	xor	tmp2, [ptr+len-PS]
+	sub	tmp, 1
+	jge	nextvect_Nbytes		;Loop for each source
+
+	mov	tmp, vec		;Back to last vector
+	mov	ptr, [arg2+PS+tmp*PS]	;Get last vec
+	mov	[ptr+len-PS], tmp2 	;Write parity
+	sub	len, PS
+	sub	tmp3, PS
+	jg	loopN_bytes
+
+	cmp	len, 128		;Now len is aligned to 128B
+	jge	len_aligned_128bytes	;We can do the rest aligned
+
+	cmp	len, 0
+	je	return_pass
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;;       func         core, ver, snum
+slversion xor_gen_sse, 00,   0c,  0030
+
--- a/raid/xor_gen_test.c
+++ b/raid/xor_gen_test.c
@ -0,0 +1,165 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN     1024
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Fill the first buffer_size bytes of buf with pseudo-random data.
+// Exactly one rand() call is made per byte, in ascending address
+// order, and each result is truncated to unsigned char when stored.
+// A buffer_size of zero or less leaves buf untouched.
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	unsigned char *cursor = buf;
+	long remaining = buffer_size;
+
+	while (remaining > 0) {
+		*cursor++ = rand();
+		remaining--;
+	}
+}
+
+// Self-test for xor_gen().
+//
+// Allocates TEST_SOURCES + 1 buffers of TEST_LEN bytes (32-byte
+// aligned), then runs xor_gen() over several input patterns: all
+// zeros, random data, a varying number of buffers, every length from
+// 0 to TEST_LEN, and regions that finish at the end of each buffer.
+// The all-zero case inspects the last buffer directly; every other
+// case is verified with xor_check_base(). Progress is printed as '.'
+// characters on stdout.
+//
+// argc and argv are unused.
+// Returns 0 when every step passes, or non-zero at the first failure
+// after printing a newline-terminated diagnostic.
+int main(int argc, char *argv[])
+{
+	int i, j, k, ret, fail = 0;
+	void *buffs[TEST_SOURCES + 1];
+	char *tmp_buf[TEST_SOURCES + 1];
+
+	printf("Test xor_gen_test ");
+
+	// Fixed seed so the pseudo-random data is the same on every run
+	srand(TEST_SEED);
+
+	// Allocate the arrays
+	for (i = 0; i < TEST_SOURCES + 1; i++) {
+		void *buf;
+		ret = posix_memalign(&buf, 32, TEST_LEN);
+		if (ret) {
+			printf("alloc error: Fail\n");
+			return 1;
+		}
+		buffs[i] = buf;
+	}
+
+	// Test of all zeros
+	for (i = 0; i < TEST_SOURCES + 1; i++)
+		memset(buffs[i], 0, TEST_LEN);
+
+	xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+	// With all-zero inputs the last buffer must still be all zero
+	for (i = 0; i < TEST_LEN; i++) {
+		if (((char *)buffs[TEST_SOURCES])[i] != 0)
+			fail++;
+	}
+
+	if (fail != 0) {
+		printf("fail zero test\n");
+		return 1;
+	} else
+		putchar('.');
+
+	// Test rand1
+	for (i = 0; i < TEST_SOURCES + 1; i++)
+		rand_buffer(buffs[i], TEST_LEN);
+
+	xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+	fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+	// Any non-zero check result counts as a failure
+	if (fail != 0) {
+		printf("fail rand test %d\n", fail);
+		return 1;
+	} else
+		putchar('.');
+
+	// Test various number of sources
+	for (j = 3; j <= TEST_SOURCES + 1; j++) {
+		for (i = 0; i < j; i++)
+			rand_buffer(buffs[i], TEST_LEN);
+
+		xor_gen(j, TEST_LEN, buffs);
+		fail |= xor_check_base(j, TEST_LEN, buffs);
+
+		if (fail != 0) {
+			printf("fail rand test %d sources\n", j);
+			return 1;
+		} else
+			putchar('.');
+	}
+
+	fflush(0);
+
+	// Test various number of sources and len
+	// (every length from 0 through TEST_LEN, one byte at a time)
+	k = 0;
+	while (k <= TEST_LEN) {
+		for (j = 3; j <= TEST_SOURCES + 1; j++) {
+			for (i = 0; i < j; i++)
+				rand_buffer(buffs[i], k);
+
+			xor_gen(j, k, buffs);
+			fail |= xor_check_base(j, k, buffs);
+
+			if (fail != 0) {
+				printf("fail rand test %d sources, len=%d, ret=%d\n", j, k,
+				       fail);
+				return 1;
+			}
+		}
+		putchar('.');
+		k += 1;
+	}
+
+	// Test at the end of buffer
+	// Each pass moves the start 32 bytes further in, so the region
+	// always finishes on the last byte of the allocation and the
+	// shifted pointers keep the 32-byte alignment of the buffers.
+	for (i = 0; i < TEST_LEN; i += 32) {
+		for (j = 0; j < TEST_SOURCES + 1; j++) {
+			rand_buffer((unsigned char *)buffs[j] + i, TEST_LEN - i);
+			tmp_buf[j] = (char *)buffs[j] + i;
+		}
+
+		xor_gen(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+		fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+
+		if (fail != 0) {
+			printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+			return 1;
+		}
+
+		putchar('.');
+		fflush(0);
+	}
+
+	if (!fail)
+		printf(" done: Pass\n");
+
+	return fail;
+}