1 思路
由于 ARMv7 开始支持 Neon 指令集, 因此可以把 SSE 指令翻译成 Neon 指令. 有一些开源项目已经支持这种“指令翻译”, 比如 sse2neon, simde 等. 目前来看 simde 支持得最好.
当然这种思路也有限制, 只支持小端字节序的平台. 如果要移植到大端字节序平台, 则需要改动 hyperscan 大量源码… 几乎是不可能的任务, 太难维护了.
我移植的代码见 github.com/zzqcn/hyperscan/ 中的 porting 分支.
2 编译依赖项
需要先安装:
- 交叉编译工具链(toolchain) 本文假设放在
/opt/toolchain
- cmake
- ragel
- Boost库 下载压缩包后解压即可, 不需要编译
- SIMDe 本文假设安装在
/home/zzq/dev/simde
3 配置cmake编译环境
准备toolchain file
在 cmake 下面建立 toolchain 子目录, 编写 armv7.cmake 文件:
```makefile
# ARMv7
# Cross-compilation toolchain file for ARMv7 Linux (buildroot GCC 5.3).
# Use it with: cmake -DCMAKE_TOOLCHAIN_FILE=<path>/armv7.cmake ...

# Target platform description.
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR arm)

# Locations inside the cross toolchain installation.
set(TOOLCHAIN_DIR /opt/toolchain/crosstools-arm-gcc-5.3-linux-4.1-glibc-2.22-binutils-2.25)
set(TOOLCHAIN_INCLUDE
    ${TOOLCHAIN_DIR}/usr/include
    ${TOOLCHAIN_DIR}/usr/arm-buildroot-linux-gnueabi/sysroot/include
)
set(TOOLCHAIN_LIB
    ${TOOLCHAIN_DIR}/usr/lib
)

set(CMAKE_SYSROOT ${TOOLCHAIN_DIR}/usr/arm-buildroot-linux-gnueabi/sysroot)

# Cross compilers.
set(CMAKE_C_COMPILER ${TOOLCHAIN_DIR}/usr/bin/arm-buildroot-linux-gnueabi-gcc)
set(CMAKE_CXX_COMPILER ${TOOLCHAIN_DIR}/usr/bin/arm-buildroot-linux-gnueabi-g++)

# Declare the compilers as working so that CMake does not try to build and
# run its "simple test program" with the cross compiler.
set(CMAKE_C_COMPILER_WORKS 1)
set(CMAKE_CXX_COMPILER_WORKS 1)

# Enable ARMv7-A with NEON. The value must be wrapped in plain ASCII double
# quotes: typographic quotes are ordinary characters to CMake and would turn
# the flags into a broken list.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon")

# Look for libraries, headers and packages only under the target root;
# host programs are still taken from the build machine.
set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN_DIR}/usr/arm-buildroot-linux-gnueabi)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

# Location of the SIMDe headers; override with -DSIMDE_INCLUDE=<dir>.
set(SIMDE_INCLUDE /home/zzq/dev/simde/simde CACHE PATH "SIMDe include directory")

include_directories(
    ${TOOLCHAIN_DIR}/usr/arm-buildroot-linux-gnueabi/sysroot/include
    ${TOOLCHAIN_DIR}/usr/include
    ${SIMDE_INCLUDE}
)
```
具体细节请参考 [https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html](https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html)
<a name="hZ5i5"></a>
## 修改CMakeLists.txt
屏蔽掉编译时会出错的行.<br />一处是会检查 `-march` , `-mtune` , 把这里设置为在x86平台编译时才检查:
```makefile
# Probe the compiler's "native" -march/-mtune value, but only when the target
# processor name starts with "x86"; other targets skip the whole probe.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
# arg1 might exist if using ccache
string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
# Ask the compiler to print its target options as resolved for -march=native.
set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native)
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
OUTPUT_VARIABLE _GCC_OUTPUT)
# Extract the value printed after "march=" into GNUCC_ARCH.
string(REGEX REPLACE ".*march=[ \t]*([^ \n]*)[ \n].*" "\\1"
GNUCC_ARCH "${_GCC_OUTPUT}")
# test the parsed flag
# (preprocess empty input with -mtune=<parsed value>; a non-zero exit status
# means the compiler rejected the value)
set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
OUTPUT_QUIET ERROR_QUIET
INPUT_FILE /dev/null
RESULT_VARIABLE GNUCC_TUNE_TEST)
if (NOT GNUCC_TUNE_TEST EQUAL 0)
message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid")
endif()
endif()
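```
`if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")` 是判断当前平台是不是 x86 的一个技巧, 下文会经常用到. 注意: 这个正则只能匹配以 x86 开头的名称(如 x86_64), 匹配不到 32 位 Linux 上的 i686 或 Windows 上的 AMD64, 如需支持这些平台要相应扩展正则.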
第二处是检查 x86 intrinsics 头文件, 这里也改为只在 x86 上检查:
# Probe for the x86 intrinsics headers (C and C++) only when targeting x86.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
    check_include_files(intrin.h HAVE_C_INTRIN_H)
    check_include_file_cxx(intrin.h HAVE_CXX_INTRIN_H)
    check_include_files(x86intrin.h HAVE_C_X86INTRIN_H)
    check_include_file_cxx(x86intrin.h HAVE_CXX_X86INTRIN_H)
endif()
修改cmake/arch.cmake
这个文件会检查 x86 指令集可用性, 同样设置为只在 x86 上检查, 而在 ARM 平台假装支持 SSSE3.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
if (HAVE_C_X86INTRIN_H)
set (INTRIN_INC_H "x86intrin.h")
...
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
set(HAVE_SSSE3 TRUE)
else()
message(FATAL_ERROR "don't support processor " ${CMAKE_SYSTEM_PROCESSOR})
endif()
修改cmake/platform.cmake
这个文件会检查当前平台, 改为在 x86 上检查, 而在 ARM 默认设置为 ARM32 位, 这里的设置最终会影响 config.h 的输出.
# Determine the target architecture. The variables set here (ARCH_X86_64,
# ARCH_IA32, ARCH_32_BIT, ARCH_64_BIT, ARCH_ARM32) are consumed by
# config.h.in through #cmakedefine.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
    # really only interested in the preprocessor here
    CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT)
    # The diagnostic now says "not 32bit"; it previously repeated the 64-bit
    # message. The probe result itself is unaffected.
    CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)
    set(ARCH_X86_64 ${ARCH_64_BIT})
    set(ARCH_IA32 ${ARCH_32_BIT})
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
    # TODO: only 32-bit ARM is handled. NOTE(review): "^arm" also matches
    # names such as "arm64" -- confirm whether those must be rejected.
    set(ARCH_32_BIT TRUE)
    set(ARCH_ARM32 TRUE)
else()
    message(FATAL_ERROR "don't support processor " ${CMAKE_SYSTEM_PROCESSOR})
endif()
修改config.h.in
添加
#cmakedefine ARCH_ARM32
这个文件相当于一个模板, 会根据 cmake 文件的判定结果生成最终的 config.h. 对于我们的 ARMv7 移植, config.h 中会生成 `#define ARCH_ARM32`.
其他cmake小技巧
进行正则匹配, 检查当前平台名称
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^x86")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
set(HAVE_SSSE3 TRUE)
else()
输出状态信息或错误信息:
message(STATUS "XXXXXXX")
message(FATAL_ERROR "XXXXXXXXXXX")
在 cmake 文件里控制 config.h.in 文件里的宏生成:
# cmake:
set(ARCH_32_BIT TRUE)
set(ARCH_ARM32 TRUE)
# config.h.in:
#cmakedefine ARCH_32_BIT
#cmakedefine ARCH_ARM32
打印 include 路径
# Debug helper: list every include directory attached to the current
# source directory, one STATUS line per entry.
get_property(include_dir_list
    DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    PROPERTY INCLUDE_DIRECTORIES)
foreach(include_dir ${include_dir_list})
    message(STATUS "dir='${include_dir}'")
endforeach()
4 修改hyperscan源码
修改arch.h
添加:
/* Define NO_ASM whenever the compiler targets ARM (__arm__ is predefined).
 * NOTE(review): presumably this turns off the x86 inline-assembly code paths
 * elsewhere in the tree -- confirm against the users of NO_ASM. */
#if defined(__arm__)
#define NO_ASM
#endif
修改intrinsics.h
把原有代码包含在 x86 的判断条件里, 而 arm 时包含 sse_helper.h.
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#ifdef __cplusplus
# if defined(HAVE_CXX_X86INTRIN_H)
# define USE_X86INTRIN_H
...
#elif defined(ARCH_ARM32)
#include "sse_helper.h"
#endif
这个 sse_helper.h 是我们新添加的代码, 是移植的关键!
添加sse_helper.h
很简单, 把 hyperscan 用到的 SSSE3 及更早指令集的 intrinsics 映射成 SIMDe 的对应函数:
/*
 * sse_helper.h - maps the x86 SSE-family intrinsic names used by hyperscan
 * onto their SIMDe equivalents, so the existing source compiles unchanged on
 * ARM. Each mapping is a plain preprocessor alias: _mm_xxx -> simde_mm_xxx.
 */
#ifndef SSE_HELPER_H
#define SSE_HELPER_H

/* Resolved through the SIMDe include directory added by the toolchain file. */
#include "x86/ssse3.h"

/* Only macros are defined below, so no extern "C" wrapper is required. */

/* Comparisons */
#define _mm_cmpeq_epi8      simde_mm_cmpeq_epi8
#define _mm_cmpeq_epi32     simde_mm_cmpeq_epi32
/* NOTE(review): _mm_cmpeq_epi64, _mm_extract_epi32 and _mm_extract_epi64 are
 * SSE4.1-level intrinsics; confirm that x86/ssse3.h declares the simde_
 * versions, otherwise include x86/sse4.1.h as well. */
#define _mm_cmpeq_epi64     simde_mm_cmpeq_epi64

/* Bitwise logic */
#define _mm_and_si128       simde_mm_and_si128
#define _mm_andnot_si128    simde_mm_andnot_si128
#define _mm_or_si128        simde_mm_or_si128
#define _mm_xor_si128       simde_mm_xor_si128

/* Shifts and byte alignment */
#define _mm_slli_epi64      simde_mm_slli_epi64
#define _mm_srli_epi64      simde_mm_srli_epi64
#define _mm_sll_epi64       simde_mm_sll_epi64
#define _mm_slli_si128      simde_mm_slli_si128
#define _mm_srli_si128      simde_mm_srli_si128
#define _mm_alignr_epi8     simde_mm_alignr_epi8

/* Constructors */
#define _mm_setzero_si128   simde_mm_setzero_si128
#define _mm_set1_epi8       simde_mm_set1_epi8
#define _mm_set1_epi32      simde_mm_set1_epi32
#define _mm_set1_epi64x     simde_mm_set1_epi64x
#define _mm_set_epi32       simde_mm_set_epi32
#define _mm_set_epi64x      simde_mm_set_epi64x

/* Conversions, extraction and masks */
#define _mm_cvtsi32_si128   simde_mm_cvtsi32_si128
#define _mm_cvtsi128_si32   simde_mm_cvtsi128_si32
#define _mm_extract_epi32   simde_mm_extract_epi32
#define _mm_extract_epi64   simde_mm_extract_epi64
#define _mm_castsi128_ps    simde_mm_castsi128_ps
#define _mm_movemask_epi8   simde_mm_movemask_epi8
#define _mm_movemask_ps     simde_mm_movemask_ps

/* Loads and stores */
#define _mm_load_si128      simde_mm_load_si128
#define _mm_loadu_si128     simde_mm_loadu_si128
#define _mm_storeu_si128    simde_mm_storeu_si128

/* Arithmetic, min/max, packing and shuffling */
#define _mm_max_epu8        simde_mm_max_epu8
#define _mm_min_epu8        simde_mm_min_epu8
#define _mm_adds_epu8       simde_mm_adds_epu8
#define _mm_sub_epi8        simde_mm_sub_epi8
#define _mm_packs_epi16     simde_mm_packs_epi16
#define _mm_packs_epi32     simde_mm_packs_epi32
#define _mm_shuffle_epi8    simde_mm_shuffle_epi8

#endif /* SSE_HELPER_H */
修改simd_types.h
对于 ARM, 基于 simde__m128i 定义 m128 类型, 更宽的 m256/m384/m512 则用结构体拼出.
/* Vector type aliases (m128 / m256 / m384 / m512), selected per architecture. */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
/* x86: use the native vector type when the instruction set is available,
 * otherwise fall back to a struct made of smaller parts. */
#if defined(HAVE_SSE2)
typedef __m128i m128;
#else
typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
#endif
#if defined(HAVE_AVX2)
typedef __m256i m256;
#else
typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
#endif
/* m384 is always a struct of three m128 parts. */
typedef struct {m128 lo; m128 mid; m128 hi;} m384;
#if defined(HAVE_AVX512)
typedef __m512i m512;
#else
typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
#endif
#elif defined(ARCH_ARM32)
/* ARM32: m128 is SIMDe's 128-bit integer vector; every wider type is a
 * struct built from m128 / m256 parts. */
typedef simde__m128i m128;
typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
typedef struct {m128 lo; m128 mid; m128 hi;} m384;
typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
#endif
修改simd_utils.h
在 x86 上检查 SIMD 指令的最低依赖 - SSSE3, 而在 ARM 编译时默认支持.
/* On x86 (non-Windows) builds, refuse to compile unless the compiler has
 * SSSE3 enabled (__SSSE3__ defined). Other architectures skip the check. */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if !defined(_WIN32) && !defined(__SSSE3__)
#error SSSE3 instructions must be enabled
#endif
#endif
修改cpuid_flags.h
对 cpuid.h 的检查只在 x86 进行
/* Include <cpuid.h> only for x86 targets and only on non-Windows builds;
 * CPUID_H_ acts as a local guard so the header is included just once. */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if !defined(_WIN32) && !defined(CPUID_H_)
#include <cpuid.h>
/* system header doesn't have a header guard */
#define CPUID_H_
#endif
#endif
修改cpuid_inline.h
这里的函数用来检查 cpu 特性支持, 对于 ARM, 除了 check_ssse3 返回 1 (SSSE3 由 SIMDe 翻译提供) 之外, 其余全返回 0 就完事了.
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if !defined(_WIN32) && !defined(CPUID_H_)
#include <cpuid.h>
/* system header doesn't have a header guard */
#define CPUID_H_
#endif
#endif
#ifdef __cplusplus
extern "C"
{
#endif
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
static inline
void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax,
unsigned int *ebx, unsigned int *ecx, unsigned int *edx) {
...
#elif defined(ARCH_ARM32)
/* ARM32 replacements for the x86 CPU feature probes: each returns a fixed
 * answer instead of querying the CPU. */

/* SSSE3 is reported as available (returns 1). */
static inline
int check_ssse3(void) {
return 1;
}
/* SSE4.2 is reported as unavailable (returns 0). */
static inline
int check_sse42(void) {
return 0;
}
/* POPCNT is reported as unavailable (returns 0). */
static inline
int check_popcnt(void) {
return 0;
}
/* AVX2 is reported as unavailable (returns 0). */
static inline
int check_avx2(void) {
return 0;
}
/* AVX-512 is reported as unavailable (returns 0). */
static inline
int check_avx512(void) {
return 0;
}
#endif
修改cpuid_flags.c
让原有代码只在 x86 编译时编译, 在 arm 上 cpuid_flags 返回 0, cpuid_tune 返回 HS_TUNE_FAMILY_GENERIC.
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if !defined(_WIN32) && !defined(CPUID_H_)
#include <cpuid.h>
#endif
...
#elif defined(ARCH_ARM32)
/* ARM32: no CPU feature flags are reported; always returns 0. */
u64a cpuid_flags(void) {
return 0;
}
/* ARM32: always reports the generic tuning family. */
u32 cpuid_tune(void) {
return HS_TUNE_FAMILY_GENERIC;
}
#endif
5 编译
设置LD_LIBRARY_PATH环境变量
$export LD_LIBRARY_PATH=/opt/toolchain/crosstools-arm-gcc-5.3-linux-4.1-glibc-2.22-binutils-2.25/usr/lib
设置SYSROOT环境变量
接下来就可以编译了. 编译步骤主要分以下几步
- 在 hyperscan 源代码目录以外的地方建立编译目录, 如 hs_build/
- 进入 hs_build, 运行 cmake, 检查编译环境并生成 Makefile
- 运行
cmake --build .
进行编译
下面说的都是第 2 步.
默认编译为静态库
$cmake -DCMAKE_TOOLCHAIN_FILE=../hyperscan/cmake/toolchain/armv7.cmake -DCMAKE_C_FLAGS="-march=armv7-a -mfpu=neon" -DCMAKE_CXX_FLAGS="-march=armv7-a -mfpu=neon" -DBOOST_ROOT=/home/zzq/pkg/boost_1_71_0 ../hyperscan
默认的编译选项是带调试信息的 Release 版本 (CMAKE_BUILD_TYPE=RelWithDebInfo). 静态编译出来的库文件是非常大的:
$du -sh lib/*
4.5M lib/libcorpusomatic.a
132K lib/libcrosscompileutil.a
72K lib/libdatabaseutil.a
428K lib/libexpressionutil.a
243M lib/libhs.a
18M lib/libhs_runtime.a
编译为动态库
$cmake -DCMAKE_TOOLCHAIN_FILE=../hyperscan/cmake/toolchain/armv7.cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=on -DCMAKE_C_FLAGS="-march=armv7-a -mfpu=neon" -DCMAKE_CXX_FLAGS="-march=armv7-a -mfpu=neon" -DBOOST_ROOT=/home/zzq/pkg/boost_1_71_0 ../hyperscan
编译时的小技巧
显示当前平台名称或宏定义: `gcc -dM -E - < /dev/null | grep ARCH`
# ARM
$/opt/toolchain/crosstools-arm-gcc-5.3-linux-4.1-glibc-2.22-binutils-2.25/usr/bin/arm-buildroot-linux-gnueabi-gcc -dM -E - < /dev/null | grep ARCH
#define __ARM_ARCH_ISA_ARM 1
#define __ARM_ARCH_PROFILE 65
#define __ARM_ARCH_ISA_THUMB 2
#define __ARM_ARCH 7
#define __ARM_ARCH_7A__ 1
# MIPS
$/opt/toolchain/mips-linux-uclibc-4.9.3/usr/bin/mips-buildroot-linux-uclibc-gcc -dM -E - < /dev/null | grep ARCH
#define _MIPS_ARCH "mips32r2"
#define _MIPS_ARCH_MIPS32R2 1
gcc arm编译选项: https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html
gcc mips编译选项: https://gcc.gnu.org/onlinedocs/gcc/MIPS-Options.html
编译时某些编译器可能会有找不到 math 库里 isnan 的问题, 见https://stackoverflow.com/questions/39130040/cmath-hides-isnan-in-math-h-in-c14-c11/39132787
6 单元测试
hyperscan 自带单元测试(unit/), 默认编译完成后, 会在编译目录的 bin 子目录生成 unit-hyperscan 可执行程序, 可执行它进行单元测试. 运行时间会很长, 在 i5-9400 主机的 Ubuntu14.04 虚拟机上会运行 1 个小时, 才会跑完所有测试用例.
zzq@ubuntu14:~/dev/hs_build/bin
$ls
hscheck* simplegrep* unit-hyperscan*
使用 qemu-arm 运行的示例:
$qemu-arm -L /opt/toolchain/crosstools-arm-gcc-5.3-linux-4.1-glibc-2.22-binutils-2.25/usr/arm-buildroot-linux-gnueabi/sysroot/ -E LD_LIBRARY_PATH=../lib ./unit-hyperscan
运行结果示例:
参考
- GitHub: hyperscan
- GitHub: SIMDe
- hyperscan 文档: Get Started
- cmake doc: cmake toolchains
- fix: CMake: The C Compiler is not able to compile a simple test program
- CMake requires me to manually copy CMAKE_INCLUDE_PATH
本作品采用知识共享署名-非商业性使用-禁止演绎 3.0 未本地化版本许可协议进行许可。