|
|
|
|
@@ -377,12 +377,15 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
|
|
|
|
|
_mm_store_si128(output + 1, xin1);
|
|
|
|
|
_mm_store_si128(output + 2, xin2);
|
|
|
|
|
_mm_store_si128(output + 3, xin3);
|
|
|
|
|
output += (64 << interleave) / sizeof(__m128i);
|
|
|
|
|
_mm_store_si128(output + 0, xin4);
|
|
|
|
|
_mm_store_si128(output + 1, xin5);
|
|
|
|
|
_mm_store_si128(output + 2, xin6);
|
|
|
|
|
_mm_store_si128(output + 3, xin7);
|
|
|
|
|
output += (64 << interleave) / sizeof(__m128i);
|
|
|
|
|
|
|
|
|
|
constexpr int output_increment = (64 << interleave) / sizeof(__m128i);
|
|
|
|
|
|
|
|
|
|
_mm_store_si128(output + output_increment + 0, xin4);
|
|
|
|
|
_mm_store_si128(output + output_increment + 1, xin5);
|
|
|
|
|
_mm_store_si128(output + output_increment + 2, xin6);
|
|
|
|
|
_mm_store_si128(output + output_increment + 3, xin7);
|
|
|
|
|
|
|
|
|
|
output += output_increment * 2;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -414,13 +417,15 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
|
|
|
|
|
xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
|
|
|
|
|
xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
|
|
|
|
|
xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3);
|
|
|
|
|
input += (64 << interleave) / sizeof(__m128i);
|
|
|
|
|
xout4 = _mm_xor_si128(_mm_load_si128(input + 0), xout4);
|
|
|
|
|
xout5 = _mm_xor_si128(_mm_load_si128(input + 1), xout5);
|
|
|
|
|
xout6 = _mm_xor_si128(_mm_load_si128(input + 2), xout6);
|
|
|
|
|
xout7 = _mm_xor_si128(_mm_load_si128(input + 3), xout7);
|
|
|
|
|
input += (64 << interleave) / sizeof(__m128i);
|
|
|
|
|
|
|
|
|
|
constexpr int input_increment = (64 << interleave) / sizeof(__m128i);
|
|
|
|
|
|
|
|
|
|
xout4 = _mm_xor_si128(_mm_load_si128(input + input_increment + 0), xout4);
|
|
|
|
|
xout5 = _mm_xor_si128(_mm_load_si128(input + input_increment + 1), xout5);
|
|
|
|
|
xout6 = _mm_xor_si128(_mm_load_si128(input + input_increment + 2), xout6);
|
|
|
|
|
xout7 = _mm_xor_si128(_mm_load_si128(input + input_increment + 3), xout7);
|
|
|
|
|
|
|
|
|
|
input += input_increment * 2;
|
|
|
|
|
i += 8;
|
|
|
|
|
|
|
|
|
|
if ((interleave > 0) && (i < props.memory() / sizeof(__m128i))) {
|
|
|
|
|
|