From a4749e1f5028c75bd9c285a3133a901729bfefa3 Mon Sep 17 00:00:00 2001 From: Jack Zhou Date: Sun, 18 Sep 2022 00:24:14 +0800 Subject: [PATCH] Add ft compile doc and scripts (#3292) * Fix the mac compile * Add cpp, python lib building scripts * Remove cache in cpp lib * Add compile docs --- faster_tokenizer/README.md | 4 ++ faster_tokenizer/docs/compile/README.md | 13 ++++++ .../compile/how_to_build_linux_and_mac.md | 36 ++++++++++++++++ .../docs/compile/how_to_build_windows.md | 42 +++++++++++++++++++ .../faster_tokenizer/CMakeLists.txt | 2 +- faster_tokenizer/run_build_cpp_lib.bat | 7 ++++ faster_tokenizer/run_build_cpp_lib.sh | 21 ++++++++++ faster_tokenizer/run_build_py_lib.bat | 14 +++++++ faster_tokenizer/run_build_py_lib.sh | 35 ++++++++++++++++ 9 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 faster_tokenizer/docs/compile/README.md create mode 100644 faster_tokenizer/docs/compile/how_to_build_linux_and_mac.md create mode 100644 faster_tokenizer/docs/compile/how_to_build_windows.md create mode 100644 faster_tokenizer/run_build_cpp_lib.bat create mode 100644 faster_tokenizer/run_build_cpp_lib.sh create mode 100644 faster_tokenizer/run_build_py_lib.bat create mode 100644 faster_tokenizer/run_build_py_lib.sh diff --git a/faster_tokenizer/README.md b/faster_tokenizer/README.md index 45f2e3149358..6747ff743580 100644 --- a/faster_tokenizer/README.md +++ b/faster_tokenizer/README.md @@ -99,3 +99,7 @@ A:在有三种情况下,打开`use_faster=True`开关可能无法提升性 2. 加载的Tokenizer类型暂不支持Faster版本。目前支持4种Tokenizer的Faster版本,分别是BERT、ERNIE、TinyBERT以及ERNIE-M Tokenizer。若加载不支持Faster版本的Tokenizer情况下打开`use_faster`开关,PaddleNLP会给出以下warning:"The tokenizer XXX doesn't have the faster version. Please check the map paddlenlp.transformers.auto.tokenizer.FASTER_TOKENIZER_MAPPING_NAMES to see which faster tokenizers are currently supported." 3. 
待切词文本长度过短(如文本平均长度小于5)。这种情况下切词开销可能不是整个文本预处理的性能瓶颈,导致在使用FasterTokenizer后仍无法提升整体性能。 + +## 相关文档 + +[FasterTokenizer编译指南](docs/compile/README.md) diff --git a/faster_tokenizer/docs/compile/README.md b/faster_tokenizer/docs/compile/README.md new file mode 100644 index 000000000000..d7820884e1f4 --- /dev/null +++ b/faster_tokenizer/docs/compile/README.md @@ -0,0 +1,13 @@ +# FasterTokenizer编译指南 + +本文档说明编译FasterTokenizer C++库、Python库两种编译过程,根据编译的平台参考如下文档 + +- [Linux & Mac 编译](./how_to_build_linux_and_mac.md) +- [Windows编译](./how_to_build_windows.md) + +FasterTokenizer使用CMake编译,其中编译过程中,各平台上编译选项如下表所示 + +| 选项 | 作用 | 备注 | +|:---- | :--- | :--- | +| WITH_PYTHON | 是否编译Python库,默认为是 | +| WITH_TESTING | 是否编译C++单测,默认为否 | diff --git a/faster_tokenizer/docs/compile/how_to_build_linux_and_mac.md b/faster_tokenizer/docs/compile/how_to_build_linux_and_mac.md new file mode 100644 index 000000000000..5dc820525176 --- /dev/null +++ b/faster_tokenizer/docs/compile/how_to_build_linux_and_mac.md @@ -0,0 +1,36 @@ +# Linux & Mac编译 + +## 环境依赖 + +- cmake >= 3.10 +- gcc >= 8.2.0 + +## 编译C++库方法 + +```bash +git clone https://github.com/PaddlePaddle/PaddleNLP.git +cd PaddleNLP/faster_tokenizer +mkdir -p build && cd build +cmake .. -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release +make -j8 +``` + +编译后的C++库在当前目录下的`cpp`目录下。 + +## 编译Python库方法 + +```bash +git clone https://github.com/PaddlePaddle/PaddleNLP.git +cd PaddleNLP/faster_tokenizer +mkdir -p build && cd build +# 设置Python环境 +export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} +export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH} + +cmake .. 
-DWITH_PYTHON=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release +make -j8 +``` + +编译后的wheel包即在当前目录下的`dist`目录中 + +更多编译选项说明参考[编译指南](./README.md) diff --git a/faster_tokenizer/docs/compile/how_to_build_windows.md b/faster_tokenizer/docs/compile/how_to_build_windows.md new file mode 100644 index 000000000000..b7b73bc7834b --- /dev/null +++ b/faster_tokenizer/docs/compile/how_to_build_windows.md @@ -0,0 +1,41 @@ +# Windows 编译 + +## 环境依赖 + +- cmake >= 3.10 +- VS 2019 +- ninja + +以上依赖安装好后,在Windows菜单打开`x64 Native Tools Command Prompt for VS 2019`命令工具即可进行下面的编译环节。 + +## 编译C++库方法 + +```bash +git clone https://github.com/PaddlePaddle/PaddleNLP.git +cd PaddleNLP/faster_tokenizer +mkdir build & cd build +cmake .. -G "Ninja" -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release +ninja -j8 +``` + +编译后的C++库在当前目录下的`cpp`目录下。 + +## 编译Python库方法 + +```bash +git clone https://github.com/PaddlePaddle/PaddleNLP.git +cd PaddleNLP/faster_tokenizer +mkdir build & cd build +rem 需要指定Python库 +cmake .. 
-G "Ninja" -DWITH_PYTHON=ON ^ + -DWITH_TESTING=OFF ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DPYTHON_EXECUTABLE=C:\Python37\python.exe ^ + -DPYTHON_INCLUDE_DIR=C:\Python37\include ^ + -DPYTHON_LIBRARY=C:\Python37\libs\python37.lib +ninja -j8 +``` + +编译后的wheel包即在当前目录下的`dist`目录中 + +更多编译选项说明参考[编译指南](./README.md) diff --git a/faster_tokenizer/faster_tokenizer/CMakeLists.txt b/faster_tokenizer/faster_tokenizer/CMakeLists.txt index 2fea6d18643a..cf4abb40395d 100644 --- a/faster_tokenizer/faster_tokenizer/CMakeLists.txt +++ b/faster_tokenizer/faster_tokenizer/CMakeLists.txt @@ -6,7 +6,7 @@ add_subdirectory(postprocessors) add_subdirectory(core) add_subdirectory(utils) # set the relative path of shared library -if (UNIX) +if (NOT APPLE AND NOT WIN32) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'") endif() diff --git a/faster_tokenizer/run_build_cpp_lib.bat b/faster_tokenizer/run_build_cpp_lib.bat new file mode 100644 index 000000000000..faf396a27be5 --- /dev/null +++ b/faster_tokenizer/run_build_cpp_lib.bat @@ -0,0 +1,8 @@ +rem Clean rebuild of the C++ library only, WITH_PYTHON=OFF, in build_cpp using Ninja in Release mode. The cd is checked so the wipe below never runs outside build_cpp. +if not exist build_cpp mkdir build_cpp +cd build_cpp || exit /b 1 +for /d %%G in ("*") do rmdir /s /q "%%G" +del /q * +cmake .. -G "Ninja" -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release +ninja -j20 +cd .. \ No newline at end of file diff --git a/faster_tokenizer/run_build_cpp_lib.sh b/faster_tokenizer/run_build_cpp_lib.sh new file mode 100644 index 000000000000..0d8e9b8bf67d --- /dev/null +++ b/faster_tokenizer/run_build_cpp_lib.sh @@ -0,0 +1,21 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Clean rebuild of the C++ library only (WITH_PYTHON=OFF) in build_cpp; works on Linux and macOS. The cd is checked so rm -rf never runs outside build_cpp. +mkdir -p build_cpp +cd build_cpp || exit 1 +rm -rf * +cmake .. -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release +make -j48 +cd .. \ No newline at end of file diff --git a/faster_tokenizer/run_build_py_lib.bat b/faster_tokenizer/run_build_py_lib.bat new file mode 100644 index 000000000000..1934162581cc --- /dev/null +++ b/faster_tokenizer/run_build_py_lib.bat @@ -0,0 +1,15 @@ +rem Clean rebuild of the Python library for Python 3.6 to 3.9, one build_py3x directory per version; expects the interpreters under C:\Python3x. The cd is checked so the wipe never runs outside the build directory. +for %%x in (6 7 8 9) do ( + if not exist build_py3%%x mkdir build_py3%%x + cd build_py3%%x || exit /b 1 + for /d %%G in ("*") do rmdir /s /q "%%G" + del /q * + cmake .. -G "Ninja" -DWITH_PYTHON=ON ^ + -DWITH_TESTING=OFF ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DPYTHON_EXECUTABLE=C:\Python3%%x\python.exe ^ + -DPYTHON_INCLUDE_DIR=C:\Python3%%x\include ^ + -DPYTHON_LIBRARY=C:\Python3%%x\libs\python3%%x.lib + ninja -j20 + cd .. +) diff --git a/faster_tokenizer/run_build_py_lib.sh b/faster_tokenizer/run_build_py_lib.sh new file mode 100644 index 000000000000..c6d61e6257b1 --- /dev/null +++ b/faster_tokenizer/run_build_py_lib.sh @@ -0,0 +1,35 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Works on Linux and macOS; uses only POSIX sh syntax because the file has no shebang. +# Clean rebuild of the Python library for Python 3.6 to 3.9, one build_py3X directory per version; the cd is checked so rm -rf never runs outside it. +mkdir -p build_py36 build_py37 build_py38 build_py39 +for py_version in 6 7 8 9; +do + cd build_py3${py_version} || exit 1 + rm -rf * + platform="$(uname -s)" + if [ "$platform" = "Linux" ]; + then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.${py_version}.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.${py_version}.0/bin/:${PATH} + else + export LD_LIBRARY_PATH=/Users/paddle/miniconda2/envs/py3${py_version}/lib/:${LD_LIBRARY_PATH} + export PATH=/Users/paddle/miniconda2/envs/py3${py_version}/bin/:${PATH} + fi + cmake .. -DWITH_PYTHON=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release + make -j24 + cd .. +done +