
block_class: add aarch64 SIMD

Speedup of 2.3x for this particular function. Can probably still be
improved somewhat.
Nicolas F
2021-05-11 11:57:52 +02:00
parent 86963c5de9
commit 772d73ce04
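
The speedup comes from testing eight 16-bit block IDs per loop iteration instead of one: the searched-for ID is broadcast into every NEON lane, vceqq_u16 compares all eight lanes at once, and any non-zero bit in the comparison result means a match. Below is a minimal standalone sketch of that pattern under the same assumptions as the diff that follows (16-bit IDs, AArch64 target); the function and variable names (find_u16_neon, needle, haystack) are hypothetical and not taken from the repository, and it uses vreinterpretq/vgetq_lane_u64 rather than the commit's C-style vector casts.

/* sketch.c -- hedged sketch, not code from the repository.
 * Build on an AArch64 machine, e.g.: cc -O2 sketch.c -o sketch */
#include <arm_neon.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Return true if `needle` occurs anywhere in `haystack` (hypothetical names). */
static bool find_u16_neon(uint16_t needle, const uint16_t *haystack, size_t len)
{
    size_t i = 0;
    /* Broadcast the needle into all 8 lanes once, outside the loop. */
    const uint16x8_t needle_vec = vmovq_n_u16(needle);
    /* Compare 8 elements per iteration; vceqq_u16 turns a matching lane into all-ones. */
    for (; i + 8 <= len; i += 8) {
        const uint16x8_t chunk = vld1q_u16(&haystack[i]);
        const uint16x8_t cmp = vceqq_u16(needle_vec, chunk);
        /* Any non-zero bit in either 64-bit half means at least one lane matched. */
        if (vgetq_lane_u64(vreinterpretq_u64_u16(cmp), 0) |
            vgetq_lane_u64(vreinterpretq_u64_u16(cmp), 1)) {
            return true;
        }
    }
    /* Scalar tail for the remaining 0-7 elements. */
    for (; i < len; i++) {
        if (haystack[i] == needle)
            return true;
    }
    return false;
}

int main(void)
{
    const uint16_t ids[] = { 1, 4, 9, 16, 25, 36, 49, 64, 81, 100 };
    printf("%d\n", find_u16_neon(49, ids, 10));  /* prints 1 */
    printf("%d\n", find_u16_neon(50, ids, 10));  /* prints 0 */
    return 0;
}

The scalar tail at the end plays the role that the function's remaining non-SIMD loop presumably plays for leftover elements after the vectorized passes.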


@@ -21,6 +21,9 @@
#if defined(__i386__) || defined(__x86_64__)
#include <immintrin.h>
#endif
#if defined(__aarch64__)
#include <arm_neon.h>
#endif
bool block_class_is_subset(
    mc_block_t block,
@@ -39,6 +42,18 @@ bool block_class_is_subset(
        }
    }
#endif
#if defined(__aarch64__)
    /* NEON fast path: test 8 block IDs per iteration. */
    for (; i / 8 < block_class_len / 8; i += 8) {
        const uint16x8_t block_class_vec = vld1q_u16(
            (uint16_t*)&block_class[i]);
        /* Broadcast the searched-for block ID into all 8 lanes. */
        const uint16x8_t block_vec = vmovq_n_u16(block);
        /* Matching lanes become all-ones, non-matching lanes become zero. */
        const uint16x8_t block_cmp = vceqq_u16(block_vec, block_class_vec);
        /* Any set bit in either 64-bit half means at least one lane matched. */
        if (vgetq_lane_s64((int64x2_t) block_cmp, 0) |
            vgetq_lane_s64((int64x2_t) block_cmp, 1)) {
            return true;
        }
    }
#endif
#ifdef __MMX__
    for (; i / 4 < block_class_len / 4; i += 4) {
        const __m64 block_class_vec = _mm_cvtsi64_m64(