block_class: add aarch64 SIMD
Speeds up this particular function by 2.3x. There is probably still room for further improvement.
This commit is contained in:
@@ -21,6 +21,9 @@
|
|||||||
#if defined(__i386__) || defined(__x86_64__)
|
#if defined(__i386__) || defined(__x86_64__)
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(__aarch64__)
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
bool block_class_is_subset(
|
bool block_class_is_subset(
|
||||||
mc_block_t block,
|
mc_block_t block,
|
||||||
@@ -39,6 +42,18 @@ bool block_class_is_subset(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(__aarch64__)
|
||||||
|
for (; i / 8 < block_class_len / 8; i += 8) {
|
||||||
|
const uint16x8_t block_class_vec = vld1q_u16(
|
||||||
|
(uint16_t*)&block_class[i]);
|
||||||
|
const uint16x8_t block_vec = vmovq_n_u16(block);
|
||||||
|
const uint16x8_t block_cmp = vceqq_u16(block_vec, (uint16x8_t) block_class_vec);
|
||||||
|
if(vgetq_lane_s64((int64x2_t) block_cmp, 0) |
|
||||||
|
vgetq_lane_s64((int64x2_t) block_cmp, 1)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#ifdef __MMX__
|
#ifdef __MMX__
|
||||||
for (; i / 4 < block_class_len / 4; i += 4) {
|
for (; i / 4 < block_class_len / 4; i += 4) {
|
||||||
const __m64 block_class_vec = _mm_cvtsi64_m64(
|
const __m64 block_class_vec = _mm_cvtsi64_m64(
|
||||||
|
|||||||
Reference in New Issue
Block a user