diff --git a/README.md b/README.md index 010411c..9a7a0c0 100644 --- a/README.md +++ b/README.md @@ -169,19 +169,19 @@ ctest --test-dir ./build Details of configurable build options are given below: -Flag name | Default | Values | Description | -|---|---|----------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------| -| LLM_FRAMEWORK | llama.cpp | llama.cpp / mediapipe / onnxruntime-genai / mnn | Specifies the backend framework to be used. | -| BUILD_DEBUG | OFF | ON/OFF | If set to ON a debug build is configured. | -| ENABLE_STREAMLINE | OFF | ON/OFF | Enables Arm Streamline timeline annotations for analyzing LLM initialization, encode, decode, and control-path performance. | -| BUILD_LLM_TESTING | ON | ON/OFF | Builds the project's functional tests when ON. | -| BUILD_BENCHMARK | OFF | ON/OFF | Builds the framework's benchmark binaries and arm-llm-bench-cli for the project when ON. | -| BUILD_JNI_LIB| ON | ON/OFF | Builds the JNI bindings for the project. | -| LOG_LEVEL | INFO/DEBUG | DEBUG, INFO, WARN & ERROR | For BUILD_DEBUG=OFF the default value is INFO. For BUILD_DEBUG=ON, the default value is DEBUG. | -| USE_KLEIDIAI | ON | ON/OFF | Build the project with KLEIDIAI CPU optimizations; if set to OFF, optimizations are turned off. | -| CPU_ARCH | Not defined | Armv8.2_1, Armv8.2_2, Armv8.2_3, Armv8.2_4, Armv8.2_5, Armv8.6_1, Armv9.0_1_1, armv9.2_1_1, armv9.2_2_1 | Sets the target ISA architecture (AArch64) to ensure SVE is not enabled when LLM_FRAMEWORK=llama.cpp (issue affects aarch64 only). | -| GGML_METAL | OFF | ON/OFF | macOS specific. Enables Apple Metal backend in ggml for GPU acceleration (Apple Silicon only). | -| GGML_BLAS | OFF | ON/OFF | macOS specific. Enables Accelerate/BLAS backend in ggml for CPU-optimized linear algebra kernels. 
| +Flag name | Default | Values | Description | +|---|---|------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------| +| LLM_FRAMEWORK | llama.cpp | llama.cpp / mediapipe / onnxruntime-genai / mnn | Specifies the backend framework to be used. | +| BUILD_DEBUG | OFF | ON/OFF | If set to ON a debug build is configured. | +| ENABLE_STREAMLINE | OFF | ON/OFF | Enables Arm Streamline timeline annotations for analyzing LLM initialization, encode, decode, and control-path performance. | +| BUILD_LLM_TESTING | ON | ON/OFF | Builds the project's functional tests when ON. | +| BUILD_BENCHMARK | OFF | ON/OFF | Builds the framework's benchmark binaries and arm-llm-bench-cli for the project when ON. | +| BUILD_JNI_LIB | ON | ON/OFF | Builds the JNI bindings for the project. | +| LOG_LEVEL | INFO/DEBUG | DEBUG, INFO, WARN & ERROR | For BUILD_DEBUG=OFF the default value is INFO. For BUILD_DEBUG=ON, the default value is DEBUG. | +| USE_KLEIDIAI | ON | ON/OFF | Build the project with KLEIDIAI CPU optimizations; if set to OFF, optimizations are turned off. | +| CPU_ARCH | Not defined | Armv8.2_1, Armv8.2_2, Armv8.2_3, Armv8.2_4, Armv8.6_1, Armv9.2_1 | Sets the target ISA architecture (AArch64). Required when LLM_FRAMEWORK=llama.cpp on the linux-aarch64 target platform; every listed preset keeps SVE disabled (issue affects aarch64 only). | +| GGML_METAL | OFF | ON/OFF | macOS specific. Enables Apple Metal backend in ggml for GPU acceleration (Apple Silicon only). | +| GGML_BLAS | OFF | ON/OFF | macOS specific. Enables Accelerate/BLAS backend in ggml for CPU-optimized linear algebra kernels. | - `DOWNLOADS_LOCK_TIMEOUT`: A timeout value in seconds indicating how much time a lock should be tried for when downloading resources. 
This is a one-time download that CMake configuration will initiate unless it @@ -258,28 +258,25 @@ When targeting the llama.cpp LLM backend and Android (--preset=x-android-aarch64 ## Known Issue with llama.cpp -Currently there are issues with a specific architecture (SVE) integration in llama.cpp backend on aarch64. To ensure this feature is not enabled we enforce using one of our provided `CPU_ARCH` flag presets -that ensure compiler flags do not enable SVE at build time. -The table below gives the mapping of our preset CPU_ARCH flags to some common CPU feature flag sets. -Other permutations are also supported and can be tailored accordingly. If you intend to use specific features you must ensure your specific CPU implements them e.g. i8mm as this was -optional in v8.2 for example. Compilers also need to support any chosen features. +Currently there are issues with a specific architecture (SVE) integration in the llama.cpp backend on aarch64. To ensure this feature is not enabled, we enforce use of one of our provided `CPU_ARCH` flag presets that ensure compiler flags do not enable SVE at build time. +See [llama.cpp issues on GitHub](https://github.com/ggml-org/llama.cpp/issues/21548) for more information. +The table below gives the mapping of our preset `CPU_ARCH` flags to some common CPU feature flag sets. Other permutations are also supported and can be tailored accordingly. If you intend to use specific features, you must ensure your specific CPU implements them; for example, `i8mm` was optional in Armv8.2. Compilers also need to support any chosen features. 
+
+
 
-| CPU_ARCH     | C/C++ compiler flags                         |
-|--------------|----------------------------------------------|
-| Armv8.2_1    | -march=armv8.2-a+dotprod                     |
-| Armv8.2_2    | -march=armv8.2-a+dotprod+fp16                |
-| Armv8.2_3    | -march=armv8.2-a+dotprod+fp16+sve            |
-| Armv8.2_4    | -march=armv8.2-a+dotprod+i8mm                |
-| Armv8.2_5    | -march=armv8.2-a+dotprod+i8mm+sve+sme        |
-| Armv8.6_1    | -march=armv8.6-a+dotprod+fp16+i8mm           |
-| Armv9.0_1_1  | -march=armv8.6-a+dotprod+fp16+i8mm+nosve     |
-| *armv9.2_1_1 | -march=armv9.2-a+dotprod+fp16+nosve+i8mm+sme |
-| *armv9.2_2_1 | -march=armv9.2-a+dotprod+fp16+nosve+i8mm+sme |
+| CPU_ARCH    | C/C++ compiler flags                         |
+|-------------|----------------------------------------------|
+| Armv8.2_1   | -march=armv8.2-a+dotprod                     |
+| Armv8.2_2   | -march=armv8.2-a+dotprod+fp16                |
+| Armv8.2_3   | -march=armv8.2-a+dotprod+i8mm                |
+| Armv8.2_4   | -march=armv8.2-a+dotprod+fp16+i8mm           |
+| Armv8.6_1   | -march=armv8.6-a+dotprod+fp16+i8mm           |
+| Armv9.2_1\* | -march=armv9.2-a+dotprod+fp16+nosve+i8mm+sme |
 
-* Note: Different capitalisation for v9.2 presets.
+\* Armv9.2_1 is adapted from ggml's armv9.2_1 preset, with SVE explicitly disabled (`+nosve`).
 
 > **NOTE**: If you need specific version of Java set the path in `JAVA_HOME` environment variable.
 > ```shell
diff --git a/scripts/cmake/toolchains/aarch64-base.cmake b/scripts/cmake/toolchains/aarch64-base.cmake
index b00398e..2ad78f6 100644
--- a/scripts/cmake/toolchains/aarch64-base.cmake
+++ b/scripts/cmake/toolchains/aarch64-base.cmake
@@ -6,62 +6,77 @@
 
 include_guard(GLOBAL)
 
-if("${LLM_FRAMEWORK}" STREQUAL "llama.cpp"
-   AND "${TARGET_PLATFORM}" STREQUAL "linux-aarch64")
+if("${LLM_FRAMEWORK}" STREQUAL "llama.cpp"
+    AND "${TARGET_PLATFORM}" STREQUAL "linux-aarch64")
+
+    # Following block of code determines CMAKE_C_FLAGS / CMAKE_CXX_FLAGS to be used.
+    # Preset names mirror upstream ggml Linux/Android CPU presets; suffixes encode
+    # incremental feature bundles for Armv8.x/Armv9.x. Some mappings are adjusted
+    # to keep SVE disabled due to the known upstream issue.
+    # Source: https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/CMakeLists.txt#L390
 
-    # Following block of code determines CMAKE_C_FLAGS / CMAKE_CXX_FLAGS to be used
     set(_allowed_arches
-        Armv8.2_1
-        Armv8.2_2
-        Armv8.2_3
-        Armv8.2_4
-        Armv8.2_5
-        Armv8.6_1
-        Armv9.0_1_1
-        Armv9.2_1_1
-        Armv9.2_2_1
-    )
+        Armv8.2_1
+        Armv8.2_2
+        Armv8.2_3
+        Armv8.2_4
+        Armv8.6_1
+        Armv9.2_1)
 
     if(NOT DEFINED CPU_ARCH)
         list(JOIN _allowed_arches ", " _allowed_str)
         message(FATAL_ERROR
-            "CPU_ARCH is required but not set. Allowed values: ${_allowed_str}.")
+            "CPU_ARCH is required to avoid enabling SVE in llama.cpp. "
+            "Allowed values: ${_allowed_str}.")
     endif()
 
     list(FIND _allowed_arches "${CPU_ARCH}" _idx)
     if(_idx EQUAL -1)
         list(JOIN _allowed_arches ", " _allowed_str)
         message(FATAL_ERROR
-            "Invalid CPU_ARCH='${CPU_ARCH}'. Allowed values: ${_allowed_str}.")
+            "Invalid CPU_ARCH='${CPU_ARCH}'. CPU_ARCH is required to avoid enabling "
+            "SVE in llama.cpp. Allowed values: ${_allowed_str}.")
     endif()
 
     if(CPU_ARCH STREQUAL "Armv8.2_1")
+        # ggml armv8.2_1 preset: DOTPROD only.
         set(_march "armv8.2-a+dotprod")
     elseif(CPU_ARCH STREQUAL "Armv8.2_2")
+        # ggml armv8.2_2 preset: DOTPROD + FP16.
         set(_march "armv8.2-a+dotprod+fp16")
     elseif(CPU_ARCH STREQUAL "Armv8.2_3")
-        set(_march "armv8.2-a+dotprod+fp16+sve")
-    elseif(CPU_ARCH STREQUAL "Armv8.2_4")
+        # Custom preset: diverges from ggml's armv8.2_3 by dropping SVE (and FP16).
+        # Use this variant to expose the int8 matrix-multiply extension (+i8mm).
         set(_march "armv8.2-a+dotprod+i8mm")
-    elseif(CPU_ARCH STREQUAL "Armv8.2_5")
-        set(_march "armv8.2-a+dotprod+i8mm+sve+sme")
+    elseif(CPU_ARCH STREQUAL "Armv8.2_4")
+        # Custom preset: diverges from ggml's preset ladder.
+        # This keeps SVE disabled while exposing both FP16 and I8MM, which is useful
+        # for KleidiAI / quantized inference paths.
+        set(_march "armv8.2-a+dotprod+fp16+i8mm")
     elseif(CPU_ARCH STREQUAL "Armv8.6_1")
+        # Custom preset: based on ggml's armv8.6_1 preset with SVE disabled.
         set(_march "armv8.6-a+dotprod+fp16+i8mm")
-    elseif(CPU_ARCH STREQUAL "Armv9.0_1_1")
-        set(_march "armv8.6-a+dotprod+fp16+i8mm+nosve")
-    elseif(CPU_ARCH STREQUAL "armv9.2_1_1")
-        set(_march "armv9.2-a+dotprod+fp16+nosve+i8mm+sme")
-    elseif(CPU_ARCH STREQUAL "armv9.2_2_1")
+    elseif(CPU_ARCH STREQUAL "Armv9.2_1")
+        # Custom preset: based on ggml's armv9.2_1 preset with SVE disabled (+nosve).
         set(_march "armv9.2-a+dotprod+fp16+nosve+i8mm+sme")
     else()
-      list(JOIN _allowed_arches ", " _allowed_str)
+        list(JOIN _allowed_arches ", " _allowed_str)
         message(FATAL_ERROR
-            "CPU_ARCH is set to an invalid value. Allowed values: ${_allowed_str}.")
+            "CPU_ARCH is set to an invalid value. CPU_ARCH is required to avoid "
+            "enabling SVE in llama.cpp on this backend. Allowed values: ${_allowed_str}.")
     endif()
 
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${_march}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${_march}")
+    # Drop any -march already present in the flags so the preset's -march is the only one.
+    foreach(_flags_var CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+        string(REGEX REPLACE "(^| )-march=[^ ]+" "" _cleaned_flags "${${_flags_var}}")
+        string(STRIP "${_cleaned_flags}" _cleaned_flags)
+        if(_cleaned_flags STREQUAL "")
+            set(${_flags_var} "-march=${_march}")
+        else()
+            set(${_flags_var} "${_cleaned_flags} -march=${_march}")
+        endif()
+    endforeach()
 
     message(STATUS "CPU_ARCH=${CPU_ARCH} -> -march=${_march}")
-endif()
\ No newline at end of file
+endif()
diff --git a/scripts/py/requirements.json b/scripts/py/requirements.json
index 18e1153..64cea89 100644
--- a/scripts/py/requirements.json
+++ b/scripts/py/requirements.json
@@ -32,7 +32,7 @@
           "purpose": "To enable basic testing",
           "destination": "llama.cpp/qwen2vl-2b/qwen2vl-2b_Q4_0.gguf",
           "url": "https://huggingface.co/bartowski/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-Q4_0.gguf?download=true",
-          "sha256sum": "5745685d2e607a82a0696c1118e56a2a1ae0901da450fd9cd4f161c6b62867d7"
+          "sha256sum": "4154d891fd4ea54ce368c52031588a50493fa4c118faefb0ac23a9bfbc36c098"
         }
       ]
     },