1 思路
由于 ARMv7 开始支持 Neon 指令集, 因此可以把 SSE 指令翻译成 Neon 指令. 有一些开源项目已经支持这种“指令翻译”, 比如 sse2neon, simde 等. 目前来看 simde 支持得最好.
当然这种思路也有限制, 只支持小端字节序的平台. 如果要移植到大端字节序平台, 则需要改动 hyperscan 大量源码… 几乎是不可能的任务, 太难维护了.
我移植的代码见 github.com/zzqcn/hyperscan/ 中的 porting 分支.
2 编译依赖项
需要先安装:
- 交叉编译工具链(toolchain), 本文假设放在 /opt/toolchain
- cmake
- ragel
- Boost库, 下载压缩包后解压即可, 不需要编译
- SIMDe, 本文假设安装在 /home/zzq/dev/simde
3 配置cmake编译环境
准备toolchain file
在 hyperscan 源码的 cmake 目录下面建立 toolchain 子目录, 编写面向 ARMv7 的 armv7.cmake 文件:
# Cross-compilation toolchain file for an ARMv7-A (NEON) Linux target.
# Use it with: cmake -DCMAKE_TOOLCHAIN_FILE=<path>/armv7.cmake ...

# Describe the target platform.
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR arm)

# Root of the cross toolchain and its header / library directories.
set(TOOLCHAIN_DIR /opt/toolchain/crosstools-arm-gcc-5.3-linux-4.1-glibc-2.22-binutils-2.25)
set(TOOLCHAIN_INCLUDE
    ${TOOLCHAIN_DIR}/usr/include
    ${TOOLCHAIN_DIR}/usr/arm-buildroot-linux-gnueabi/sysroot/include
)
set(TOOLCHAIN_LIB
    ${TOOLCHAIN_DIR}/usr/lib
)

set(CMAKE_SYSROOT ${TOOLCHAIN_DIR}/usr/arm-buildroot-linux-gnueabi/sysroot)

# Cross compilers.
set(CMAKE_C_COMPILER ${TOOLCHAIN_DIR}/usr/bin/arm-buildroot-linux-gnueabi-gcc)
set(CMAKE_CXX_COMPILER ${TOOLCHAIN_DIR}/usr/bin/arm-buildroot-linux-gnueabi-g++)

# Mark the compilers as working so CMake skips its "compile a simple test
# program" check.
set(CMAKE_C_COMPILER_WORKS 1)
set(CMAKE_CXX_COMPILER_WORKS 1)

# Target ARMv7-A with NEON. The value must be wrapped in plain ASCII double
# quotes: typographic quotes are not CMake quoting, so they would end up inside
# the flags and the value would be split into a list.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon")

# Search libraries, headers and packages only below the target root; never
# search it for programs, which must come from the build host.
set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN_DIR}/usr/arm-buildroot-linux-gnueabi)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

# Location of the SIMDe headers; override with -DSIMDE_INCLUDE=<dir>.
set(SIMDE_INCLUDE /home/zzq/dev/simde/simde CACHE PATH "SIMDe include directory")

include_directories(
    ${TOOLCHAIN_DIR}/usr/arm-buildroot-linux-gnueabi/sysroot/include
    ${TOOLCHAIN_DIR}/usr/include
    ${SIMDE_INCLUDE}
)
具体细节请参考 [https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html](https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html)
修改CMakeLists.txt
屏蔽掉编译时会出错的行.
一处是会检查 `-march` , `-mtune` , 把这里设置为在x86平台编译时才检查:
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
    # arg1 might exist if using ccache
    string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
    set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native)
    execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
        OUTPUT_VARIABLE _GCC_OUTPUT)
    string(REGEX REPLACE ".*march=[ \t]*([^ \n]*)[ \n].*" "\\1"
        GNUCC_ARCH "${_GCC_OUTPUT}")

    # test the parsed flag
    set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})
    execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
        OUTPUT_QUIET ERROR_QUIET
        INPUT_FILE /dev/null
        RESULT_VARIABLE GNUCC_TUNE_TEST)
    if (NOT GNUCC_TUNE_TEST EQUAL 0)
        message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid")
    endif()
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86") 是判断当前平台是不是 x86 的一个技巧, 下文会经常用到.
第二处是检查 x86 intrinsics 头文件, 这里也改为只在 x86 上检查:
# Probe for the x86 intrinsics headers only when targeting an x86 processor.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
    CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
    CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
    CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
    CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
endif()
修改cmake/arch.cmake
这个文件会检查 x86 指令集可用性, 同样设置为只在 x86 上检查, 而在 ARM 平台假装支持 SSSE3.
# Excerpt: the literal "..." line marks code omitted from this excerpt.
# x86 keeps its existing instruction-set detection; ARM just sets HAVE_SSSE3;
# any other processor is rejected.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
    if (HAVE_C_X86INTRIN_H)
        set (INTRIN_INC_H "x86intrin.h")
    ...
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
    set(HAVE_SSSE3 TRUE)
else()
    message(FATAL_ERROR "don't support processor " ${CMAKE_SYSTEM_PROCESSOR})
endif()
修改cmake/platform.cmake
这个文件会检查当前平台, 改为在 x86 上检查, 而在 ARM 默认设置为 ARM32 位, 这里的设置最终会影响 config.h 的输出.
# determine the target arch
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
    # really only interested in the preprocessor here
    CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT)

    # Same probe for 32-bit x86; the diagnostic now says "not 32bit".
    CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)

    set(ARCH_X86_64 ${ARCH_64_BIT})
    set(ARCH_IA32 ${ARCH_32_BIT})
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
    # TODO: XXX
    # ARM is set to 32-bit ARM without running any probe.
    set(ARCH_32_BIT TRUE)
    set(ARCH_ARM32 TRUE)
else()
    message(FATAL_ERROR "don't support processor " ${CMAKE_SYSTEM_PROCESSOR})
endif()
修改config.h.in
添加
#cmakedefine ARCH_ARM32
这个文件相当于一个模板, 会根据 cmake 文件的判定结果生成最终的 config.h. 对于我们的 ARMv7 移植, config.h 中会生成 #define ARCH_ARM32 .
其他cmake小技巧
进行正则匹配, 检查当前平台名称
# Branch on the target processor name with a regular-expression match.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
    set(HAVE_SSSE3 TRUE)
else()
endif()
输出状态信息或错误信息:
# STATUS prints an informational line; FATAL_ERROR prints the message and
# stops processing.
message(STATUS "XXXXXXX")
message(FATAL_ERROR "XXXXXXXXXXX")
在 cmake 文件里控制 config.h.in 文件里的宏生成:
# cmake:
set(ARCH_32_BIT TRUE)
set(ARCH_ARM32 TRUE)

# config.h.in:
#cmakedefine ARCH_32_BIT
#cmakedefine ARCH_ARM32
打印 include 路径
# Print every entry of the INCLUDE_DIRECTORIES property of the current source
# directory.
get_property(dirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
foreach(dir ${dirs})
    message(STATUS "dir='${dir}'")
endforeach()
4 修改hyperscan源码
修改arch.h
添加:
/* Define NO_ASM whenever the compiler targets ARM. */
#if defined(__arm__)
#define NO_ASM
#endif
修改intrinsics.h
把原有代码包含在 x86 的判断条件里, 而 arm 时包含 sse_helper.h.
#if defined(ARCH_IA32) || defined(ARCH_X86_64)#ifdef __cplusplus# if defined(HAVE_CXX_X86INTRIN_H)# define USE_X86INTRIN_H...#elif defined(ARCH_ARM32)#include "sse_helper.h"#endif
这个 sse_helper.h 是我们新添加的代码, 是移植的关键!
添加sse_helper.h
很简单, 把 hyperscan 用到的 SSSE3 及更早的 SSE 指令 (intrinsics) 映射成 SIMDe 的对应函数:
#ifndef SSE_HELPER_H#define SSE_HELPER_H#include "x86/ssse3.h"#ifdef __cplusplusextern "C" {#endif#define _mm_cmpeq_epi8 simde_mm_cmpeq_epi8#define _mm_setzero_si128 simde_mm_setzero_si128#define _mm_xor_si128 simde_mm_xor_si128#define _mm_movemask_epi8 simde_mm_movemask_epi8#define _mm_cmpeq_epi32 simde_mm_cmpeq_epi32#define _mm_movemask_ps simde_mm_movemask_ps#define _mm_cmpeq_epi64 simde_mm_cmpeq_epi64#define _mm_slli_epi64 simde_mm_slli_epi64#define _mm_cvtsi32_si128 simde_mm_cvtsi32_si128#define _mm_srli_epi64 simde_mm_srli_epi64#define _mm_set1_epi8 simde_mm_set1_epi8#define _mm_set1_epi32 simde_mm_set1_epi32#define _mm_cvtsi128_si32 simde_mm_cvtsi128_si32#define _mm_set_epi64x simde_mm_set_epi64x#define _mm_srli_si128 simde_mm_srli_si128#define _mm_slli_si128 simde_mm_slli_si128#define _mm_extract_epi32 simde_mm_extract_epi32#define _mm_extract_epi64 simde_mm_extract_epi64#define _mm_and_si128 simde_mm_and_si128#define _mm_xor_si128 simde_mm_xor_si128#define _mm_or_si128 simde_mm_or_si128#define _mm_andnot_si128 simde_mm_andnot_si128#define _mm_load_si128 simde_mm_load_si128#define _mm_loadu_si128 simde_mm_loadu_si128#define _mm_storeu_si128 simde_mm_storeu_si128#define _mm_max_epu8 simde_mm_max_epu8#define _mm_min_epu8 simde_mm_min_epu8#define _mm_adds_epu8 simde_mm_adds_epu8#define _mm_sub_epi8 simde_mm_sub_epi8#define _mm_packs_epi16 simde_mm_packs_epi16#define _mm_packs_epi32 simde_mm_packs_epi32#define _mm_castsi128_ps simde_mm_castsi128_ps#define _mm_sll_epi64 simde_mm_sll_epi64#define _mm_shuffle_epi8 simde_mm_shuffle_epi8#define _mm_alignr_epi8 simde_mm_alignr_epi8#define _mm_set1_epi64x simde_mm_set1_epi64x#define _mm_set_epi32 simde_mm_set_epi32#ifdef __cplusplus}#endif#endif
修改simd_types.h
对于 ARM, 用 SIMDe 的 simde__m128i 定义 m128 类型, 更宽的 m256 / m384 / m512 则由 m128 组合成结构体.
/*
 * SIMD vector types.
 * x86: use the native vector type when the matching HAVE_* macro is defined,
 * otherwise a struct built from narrower parts.
 * ARM32: m128 is SIMDe's simde__m128i; the wider types are structs of m128.
 */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)

#if defined(HAVE_SSE2)
typedef __m128i m128;
#else
typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
#endif

#if defined(HAVE_AVX2)
typedef __m256i m256;
#else
typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
#endif

typedef struct {m128 lo; m128 mid; m128 hi;} m384;

#if defined(HAVE_AVX512)
typedef __m512i m512;
#else
typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
#endif

#elif defined(ARCH_ARM32)

typedef simde__m128i m128;
typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
typedef struct {m128 lo; m128 mid; m128 hi;} m384;
typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;

#endif
修改simd_utils.h
在 x86 上检查 SIMD 指令的最低依赖 - SSSE3, 而在 ARM 编译时默认支持.
/* The SSSE3 compile-flag requirement is enforced on x86 targets only. */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if !defined(_WIN32) && !defined(__SSSE3__)
#error SSSE3 instructions must be enabled
#endif
#endif
修改cpuid_flags.h
对 cpuid.h 的检查只在 x86 进行
/* <cpuid.h> is only included when building for x86. */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if !defined(_WIN32) && !defined(CPUID_H_)
#include <cpuid.h>
/* system header doesn't have a header guard */
#define CPUID_H_
#endif
#endif
修改cpuid_inline.h
这里的函数用来检查 cpu 特性支持, 对于 ARM, 除了 check_ssse3 返回 1 (假装支持 SSSE3) 以外, 其余全返回 0 就完事了.
/*
 * Excerpt: the literal "..." line marks code omitted from this excerpt.
 * The x86 branch keeps the existing cpuid-based checks; the ARM32 branch
 * supplies constant stubs.
 */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if !defined(_WIN32) && !defined(CPUID_H_)
#include <cpuid.h>
/* system header doesn't have a header guard */
#define CPUID_H_
#endif
#endif

#ifdef __cplusplus
extern "C"
{
#endif

#if defined(ARCH_IA32) || defined(ARCH_X86_64)
static inline
void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax,
           unsigned int *ebx, unsigned int *ecx, unsigned int *edx) {
...
#elif defined(ARCH_ARM32)
/* SSSE3 is reported as available on ARM32. */
static inline
int check_ssse3(void) {
    return 1;
}

/* Every other x86 feature is reported as unavailable. */
static inline
int check_sse42(void) {
    return 0;
}

static inline
int check_popcnt(void) {
    return 0;
}

static inline
int check_avx2(void) {
    return 0;
}

static inline
int check_avx512(void) {
    return 0;
}
#endif
修改cpuid_flags.c
让原有代码只在 x86 编译时编译, 在 arm 上 cpuid_flags 返回 0, cpuid_tune 返回 HS_TUNE_FAMILY_GENERIC .
/*
 * Excerpt: the literal "..." line marks code omitted from this excerpt.
 * The existing implementation is compiled for x86 only; ARM32 gets the two
 * constant functions below.
 */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if !defined(_WIN32) && !defined(CPUID_H_)
#include <cpuid.h>
#endif
...
#elif defined(ARCH_ARM32)
/* ARM32: no feature flags are reported. */
u64a cpuid_flags(void) {
    return 0;
}

/* ARM32: always tune for the generic family. */
u32 cpuid_tune(void) {
    return HS_TUNE_FAMILY_GENERIC;
}
#endif
5 编译
设置LD_LIBRARY_PATH环境变量
$export LD_LIBRARY_PATH=/opt/toolchain/crosstools-arm-gcc-5.3-linux-4.1-glibc-2.22-binutils-2.25/usr/lib
设置SYSROOT环境变量
接下来就可以编译了. 编译步骤主要分以下几步
- 在 hyperscan 源代码目录以外的地方建立编译目录, 如 hs_build/
- 进入 hs_build, 运行 cmake, 检查编译环境并生成 Makefile
- 运行 cmake --build . 进行编译
下面说的都是第 2 步.
默认编译为静态库
$cmake -DCMAKE_TOOLCHAIN_FILE=../hyperscan/cmake/toolchain/armv7.cmake -DCMAKE_C_FLAGS="-march=armv7-a -mfpu=neon" -DCMAKE_CXX_FLAGS="-march=armv7-a -mfpu=neon" -DBOOST_ROOT=/home/zzq/pkg/boost_1_71_0 ../hyperscan
默认的编译选项是带调试信息的 Release 版本( CMAKE_BUILD_TYPE=RelWithDebInfo ). 静态编译出来的库文件是非常大的:
$ du -sh lib/*
4.5M lib/libcorpusomatic.a
132K lib/libcrosscompileutil.a
72K lib/libdatabaseutil.a
428K lib/libexpressionutil.a
243M lib/libhs.a
18M lib/libhs_runtime.a
编译为动态库
$cmake -DCMAKE_TOOLCHAIN_FILE=../hyperscan/cmake/toolchain/armv7.cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=on -DCMAKE_C_FLAGS="-march=armv7-a -mfpu=neon" -DCMAKE_CXX_FLAGS="-march=armv7-a -mfpu=neon" -DBOOST_ROOT=/home/zzq/pkg/boost_1_71_0 ../hyperscan
编译时的小技巧
显示当前平台名称或宏定义:
$ gcc -dM -E - < /dev/null | grep ARCH
# ARM
$ /opt/toolchain/crosstools-arm-gcc-5.3-linux-4.1-glibc-2.22-binutils-2.25/usr/bin/arm-buildroot-linux-gnueabi-gcc -dM -E - < /dev/null | grep ARCH
#define __ARM_ARCH_ISA_ARM 1
#define __ARM_ARCH_PROFILE 65
#define __ARM_ARCH_ISA_THUMB 2
#define __ARM_ARCH 7
#define __ARM_ARCH_7A__ 1
# MIPS
$ /opt/toolchain/mips-linux-uclibc-4.9.3/usr/bin/mips-buildroot-linux-uclibc-gcc -dM -E - < /dev/null | grep ARCH
#define _MIPS_ARCH "mips32r2"
#define _MIPS_ARCH_MIPS32R2 1
gcc arm编译选项: https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html
gcc mips编译选项: https://gcc.gnu.org/onlinedocs/gcc/MIPS-Options.html
编译时某些编译器可能会有找不到 math 库里 isnan 的问题, 见https://stackoverflow.com/questions/39130040/cmath-hides-isnan-in-math-h-in-c14-c11/39132787
6 单元测试
hyperscan 自带单元测试(unit/), 默认编译完成后, 会在编译目录的 bin 子目录生成 unit-hyperscan 可执行程序, 可执行它进行单元测试. 运行时间会很长, 在 i5-9400 主机的 Ubuntu14.04 虚拟机上会运行 1 个小时, 才会跑完所有测试用例.
zzq@ubuntu14:~/dev/hs_build/bin$ ls
hscheck* simplegrep* unit-hyperscan*
使用 qemu-arm 运行的示例:
$qemu-arm -L /opt/toolchain/crosstools-arm-gcc-5.3-linux-4.1-glibc-2.22-binutils-2.25/usr/arm-buildroot-linux-gnueabi/sysroot/ -E LD_LIBRARY_PATH=../lib ./unit-hyperscan
运行结果示例:
参考
- GitHub: hyperscan
- GitHub: SIMDe
- hyperscan 文档: Get Started
- cmake doc: cmake toolchains
- fix: CMake: The C Compiler is not able to compile a simple test program
- CMake requires me to manually copy CMAKE_INCLUDE_PATH

本作品采用知识共享署名-非商业性使用-禁止演绎 3.0 未本地化版本许可协议进行许可。
