commit 17da929b82
@@ -1 +1 @@
-Subproject commit c460176523d039c8995f1d71089753725ebc0792
+Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
@@ -0,0 +1,14 @@
mindspore_add_pkg(absl
        VER 20200225.2
        LIBS absl_strings absl_throw_delegate absl_raw_logging_internal absl_int128 absl_bad_optional_access
        URL https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz
        MD5 73f2b6e72f1599a9139170c29482ddc4
        CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=TRUE)

include_directories(${absl_INC})

add_library(mindspore::absl_strings ALIAS absl::absl_strings)
add_library(mindspore::absl_throw_delegate ALIAS absl::absl_throw_delegate)
add_library(mindspore::absl_raw_logging_internal ALIAS absl::absl_raw_logging_internal)
add_library(mindspore::absl_int128 ALIAS absl::absl_int128)
add_library(mindspore::absl_bad_optional_access ALIAS absl::absl_bad_optional_access)
@@ -0,0 +1,12 @@
mindspore_add_pkg(c-ares
        VER 1.15.0
        LIBS cares
        URL https://github.com/c-ares/c-ares/releases/download/cares-1_15_0/c-ares-1.15.0.tar.gz
        MD5 d2391da274653f7643270623e822dff7
        CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release
        -DCARES_SHARED:BOOL=OFF
        -DCARES_STATIC:BOOL=ON
        -DCARES_STATIC_PIC:BOOL=ON)

include_directories(${c-ares_INC})
add_library(mindspore::cares ALIAS c-ares::cares)
@@ -0,0 +1,110 @@
set(grpc_USE_STATIC_LIBS ON)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
    set(grpc_CXXFLAGS "-fstack-protector-all -Wno-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2")
elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
    set(grpc_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2")
else()
    set(grpc_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2")
endif()

set(grpc_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack")


if (EXISTS ${protobuf_ROOT}/lib64)
    set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${protobuf_ROOT}/lib64/cmake/protobuf")
else()
    set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${protobuf_ROOT}/lib/cmake/protobuf")
endif()
message("grpc using Protobuf_DIR : " ${_FINDPACKAGE_PROTOBUF_CONFIG_DIR})

if (EXISTS ${absl_ROOT}/lib64)
    set(_FINDPACKAGE_ABSL_CONFIG_DIR "${absl_ROOT}/lib64/cmake/absl")
else()
    set(_FINDPACKAGE_ABSL_CONFIG_DIR "${absl_ROOT}/lib/cmake/absl")
endif()
message("grpc using absl_DIR : " ${_FINDPACKAGE_ABSL_CONFIG_DIR})

set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "")
if (OPENSSL_ROOT_DIR)
    set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "-DOPENSSL_ROOT_DIR:PATH=${OPENSSL_ROOT_DIR}")
endif()

mindspore_add_pkg(grpc
        VER 1.27.3
        LIBS grpc++ grpc gpr upb address_sorting
        EXE grpc_cpp_plugin
        URL https://github.com/grpc/grpc/archive/v1.27.3.tar.gz
        MD5 0c6c3fc8682d4262dd0e5e6fabe1a7e2
        CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release
        -DgRPC_INSTALL:BOOL=ON
        -DgRPC_BUILD_TESTS:BOOL=OFF
        -DgRPC_PROTOBUF_PROVIDER:STRING=package
        -DgRPC_PROTOBUF_PACKAGE_TYPE:STRING=CONFIG
        -DProtobuf_DIR:PATH=${_FINDPACKAGE_PROTOBUF_CONFIG_DIR}
        -DgRPC_ZLIB_PROVIDER:STRING=package
        -DZLIB_ROOT:PATH=${zlib_ROOT}
        -DgRPC_ABSL_PROVIDER:STRING=package
        -Dabsl_DIR:PATH=${_FINDPACKAGE_ABSL_CONFIG_DIR}
        -DgRPC_CARES_PROVIDER:STRING=package
        -Dc-ares_DIR:PATH=${c-ares_ROOT}/lib/cmake/c-ares
        -DgRPC_SSL_PROVIDER:STRING=package
        ${_CMAKE_ARGS_OPENSSL_ROOT_DIR}
        )

include_directories(${grpc_INC})

add_library(mindspore::grpc++ ALIAS grpc::grpc++)

# link other grpc libs
target_link_libraries(grpc::grpc++ INTERFACE grpc::grpc grpc::gpr grpc::upb grpc::address_sorting)

# link built dependencies
target_link_libraries(grpc::grpc++ INTERFACE mindspore::z)
target_link_libraries(grpc::grpc++ INTERFACE mindspore::cares)
target_link_libraries(grpc::grpc++ INTERFACE mindspore::absl_strings mindspore::absl_throw_delegate
        mindspore::absl_raw_logging_internal mindspore::absl_int128 mindspore::absl_bad_optional_access)

# link system openssl
find_package(OpenSSL REQUIRED)
target_link_libraries(grpc::grpc++ INTERFACE OpenSSL::SSL OpenSSL::Crypto)


function(ms_grpc_generate c_var h_var)
    if(NOT ARGN)
        message(SEND_ERROR "Error: ms_grpc_generate() called without any proto files")
        return()
    endif()

    set(${c_var})
    set(${h_var})

    foreach(file ${ARGN})
        get_filename_component(abs_file ${file} ABSOLUTE)
        get_filename_component(file_name ${file} NAME_WE)
        get_filename_component(file_dir ${abs_file} PATH)
        file(RELATIVE_PATH rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${file_dir})

        list(APPEND ${c_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.cc")
        list(APPEND ${h_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.h")
        list(APPEND ${c_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.cc")
        list(APPEND ${h_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.h")

        add_custom_command(
                OUTPUT "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.cc"
                "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.h"
                "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.cc"
                "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.h"
                WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
                COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_BINARY_DIR}/proto"
                COMMAND protobuf::protoc --version
                COMMAND protobuf::protoc -I${file_dir} --cpp_out=${CMAKE_BINARY_DIR}/proto
                --grpc_out=${CMAKE_BINARY_DIR}/proto --plugin=protoc-gen-grpc=$<TARGET_FILE:grpc::grpc_cpp_plugin> ${abs_file}
                DEPENDS protobuf::protoc grpc::grpc_cpp_plugin ${abs_file}
                COMMENT "Running C++ gRPC compiler on ${file}" VERBATIM)
    endforeach()

    set_source_files_properties(${${c_var}} ${${h_var}} PROPERTIES GENERATED TRUE)
    set(${c_var} ${${c_var}} PARENT_SCOPE)
    set(${h_var} ${${h_var}} PARENT_SCOPE)

endfunction()
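The `ms_grpc_generate` helper above only registers the protoc / grpc_cpp_plugin custom commands and hands the generated file lists back to its caller; it does not create a target itself. A minimal sketch of how a caller might consume it is shown below (the target name `example_grpc_proto` and the .proto path are illustrative assumptions, not names from this commit):

```cmake
# Hypothetical caller of ms_grpc_generate; target and .proto names are placeholders.
ms_grpc_generate(EXAMPLE_PROTO_SRCS EXAMPLE_PROTO_HDRS "proto/example_service.proto")

# Compile the generated sources and link against the gRPC alias defined above.
add_library(example_grpc_proto STATIC ${EXAMPLE_PROTO_SRCS} ${EXAMPLE_PROTO_HDRS})
# The custom command writes the generated headers to ${CMAKE_BINARY_DIR}/proto.
target_include_directories(example_grpc_proto PUBLIC ${CMAKE_BINARY_DIR}/proto)
target_link_libraries(example_grpc_proto PUBLIC mindspore::grpc++)
```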
@@ -0,0 +1,14 @@
set(pslite_USE_STATIC_LIBS ON)
if (${ENABLE_IBVERBS} STREQUAL "ON")
    set(pslite_CXXFLAGS "USE_IBVERBS=1")
endif()
mindspore_add_pkg(pslite
        LIBS ps
        URL https://github.com/dmlc/ps-lite/archive/34fd45cae457d59850fdcb2066467778d0673f21.zip
        MD5 393c0e27b68bfaf96718caa3aa96f5a3
        PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/pslite/ps_lite.patch001
        ONLY_MAKE True
        ONLY_MAKE_INCS include/*
        ONLY_MAKE_LIBS build/*)
include_directories(${pslite_INC})
add_library(mindspore::pslite ALIAS pslite::ps)
@@ -0,0 +1,5 @@
mindspore_add_pkg(zeromq
        VER 4.1.4
        HEAD_ONLY ./
        URL https://raw.githubusercontent.com/mli/deps/master/build/zeromq-4.1.4.tar.gz
        MD5 a611ecc93fffeb6d058c0e6edf4ad4fb)
@@ -0,0 +1,9 @@
mindspore_add_pkg(zlib
        VER 1.2.11
        LIBS z
        URL https://github.com/madler/zlib/archive/v1.2.11.tar.gz
        MD5 0095d2d2d1f3442ce1318336637b695f
        CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release)

include_directories(${zlib_INC})
add_library(mindspore::z ALIAS zlib::z)
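Each of these scripts hides a third-party dependency behind a `mindspore::` alias target, so the rest of the build only refers to the aliases. A minimal sketch of how a consumer target might link them, assuming a hypothetical target `example_backend` (include paths are already added globally through the include_directories() calls above):

```cmake
# Hypothetical consumer target; the alias targets carry the libraries and their
# interface dependencies (e.g. mindspore::grpc++ pulls in zlib, c-ares, absl and OpenSSL).
add_executable(example_backend main.cc)
target_link_libraries(example_backend PRIVATE mindspore::grpc++ mindspore::pslite mindspore::z)
```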
@@ -0,0 +1,67 @@
FROM ubuntu:18.04

MAINTAINER leonwanghui <leon.wanghui@huawei.com>

# Set env
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV PATH /usr/local/bin:$PATH

# Install base tools
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
    vim \
    wget \
    curl \
    xz-utils \
    net-tools \
    openssh-client \
    git \
    ntpdate \
    tzdata \
    tcl \
    sudo \
    bash-completion

# Install compile tools
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
    gcc \
    g++ \
    zlibc \
    make \
    libgmp-dev \
    patch \
    autoconf \
    libtool \
    automake \
    flex

# Set bash
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash

# Install python (v3.7.5)
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
    libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
    && cd /tmp \
    && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
    && tar -xvf v3.7.5.tar.gz \
    && cd /tmp/cpython-3.7.5 \
    && mkdir -p ${PYTHON_ROOT_PATH} \
    && ./configure --prefix=${PYTHON_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -f /usr/local/bin/python \
    && rm -f /usr/local/bin/pip \
    && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
    && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
    && rm -rf /tmp/cpython-3.7.5 \
    && rm -f /tmp/v3.7.5.tar.gz

# Set pip source
RUN mkdir -pv /root/.pip \
    && echo "[global]" > /root/.pip/pip.conf \
    && echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
    && echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf

# Install MindSpore cpu whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.0-cp37-cp37m-linux_x86_64.whl
@@ -0,0 +1,83 @@
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04

MAINTAINER leonwanghui <leon.wanghui@huawei.com>

# Set env
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV OMPI_ROOT_PATH /usr/local/openmpi-3.1.5
ENV PATH ${OMPI_ROOT_PATH}/bin:/usr/local/bin:$PATH
ENV LD_LIBRARY_PATH ${OMPI_ROOT_PATH}/lib:$LD_LIBRARY_PATH

# Install base tools
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
    vim \
    wget \
    curl \
    xz-utils \
    net-tools \
    openssh-client \
    git \
    ntpdate \
    tzdata \
    tcl \
    sudo \
    bash-completion

# Install compile tools
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
    gcc \
    g++ \
    zlibc \
    make \
    libgmp-dev \
    patch \
    autoconf \
    libtool \
    automake \
    flex \
    libnccl2=2.4.8-1+cuda10.1 \
    libnccl-dev=2.4.8-1+cuda10.1

# Set bash
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash

# Install python (v3.7.5)
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
    libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
    && cd /tmp \
    && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
    && tar -xvf v3.7.5.tar.gz \
    && cd /tmp/cpython-3.7.5 \
    && mkdir -p ${PYTHON_ROOT_PATH} \
    && ./configure --prefix=${PYTHON_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -f /usr/local/bin/python \
    && rm -f /usr/local/bin/pip \
    && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
    && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
    && rm -rf /tmp/cpython-3.7.5 \
    && rm -f /tmp/v3.7.5.tar.gz

# Set pip source
RUN mkdir -pv /root/.pip \
    && echo "[global]" > /root/.pip/pip.conf \
    && echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
    && echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf

# Install openmpi (v3.1.5)
RUN cd /tmp \
    && wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.5.tar.gz \
    && tar -xvf openmpi-3.1.5.tar.gz \
    && cd /tmp/openmpi-3.1.5 \
    && mkdir -p ${OMPI_ROOT_PATH} \
    && ./configure --prefix=${OMPI_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -rf /tmp/openmpi-3.1.5 \
    && rm -f /tmp/openmpi-3.1.5.tar.gz

# Install MindSpore cuda-10.1 whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/gpu/ubuntu_x86/cuda-10.1/mindspore_gpu-0.5.0-cp37-cp37m-linux_x86_64.whl
@@ -1,82 +0,0 @@
# Guideline to Convert CLUERNER2020 Training Data to MindRecord for Bert Fine-Tuning

<!-- TOC -->

- [What does the example do](#what-does-the-example-do)
- [How to use the example to process CLUERNER2020](#how-to-use-the-example-to-process-cluerner2020)
    - [Download CLUERNER2020 and unzip](#download-cluerner2020-and-unzip)
    - [Generate MindRecord](#generate-mindrecord)
    - [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)

<!-- /TOC -->

## What does the example do

This example converts the [CLUERNER2020](https://www.cluebenchmarks.com/introduce.html) training data into MindRecord files, which are then used for Bert fine-tuning.

1. run.sh: entry script that generates the MindRecord files.
2. run_read.sh: entry script that creates a MindDataset from the MindRecord files.
    - create_dataset.py: uses MindDataset to read the MindRecord files and build the dataset.

## How to use the example to process CLUERNER2020

Download CLUERNER2020, convert it to MindRecord, and use MindDataset to read the MindRecord files back.

### Download CLUERNER2020 and unzip

1. Download the training data zip.
> [CLUERNER2020 dataset download address](https://www.cluebenchmarks.com/introduce.html) **-> 任务介绍 (task introduction) -> CLUENER 细粒度命名实体识别 (fine-grained NER) -> cluener下载链接 (download link)**

2. Unzip the training data to the directory example/nlp_to_mindrecord/CLUERNER2020/data/cluener_public.
```
unzip -d {your-mindspore}/example/nlp_to_mindrecord/CLUERNER2020/data/cluener_public cluener_public.zip
```

### Generate MindRecord

1. Run the run.sh script.
```bash
bash run.sh
```

2. The output looks like this:
```
...
[INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:12.498.235 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/train.mindrecord'], and the list of index files are: ['data/train.mindrecord.db']
...
[INFO] ME(17603,python):2020-04-28-16:56:13.400.175 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.400.863 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.401.534 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.402.179 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.402.702 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
...
[INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:13.431.208 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/dev.mindrecord'], and the list of index files are: ['data/dev.mindrecord.db']
```

3. The following files are generated:
```bash
$ ls output/
dev.mindrecord dev.mindrecord.db README.md train.mindrecord train.mindrecord.db
```

### Create MindDataset By MindRecord

1. Run the run_read.sh script.
```bash
bash run_read.sh
```

2. The output looks like this:
```
...
example 1340: input_ids: [ 101 3173 1290 4852 7676 3949 122 3299 123 126 3189 4510 8020 6381 5442 7357 2590 3636 8021 7676 3949 4294 1166 6121 3124 1277 6121 3124 7270 2135 3295 5789 3326 123 126 3189 1355 6134 1093 1325 3173 2399 6590 6791 8024 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: label_ids: [ 0 18 19 20 2 4 0 0 0 0 0 0 0 34 36 26 27 28 0 34 35 35 35 35 35 35 35 35 35 36 26 27 28 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: input_ids: [ 101 1728 711 4293 3868 1168 2190 2150 3791 934 3633 3428 4638 6237 7025 8024 3297 1400 5310 3362 6206 5023 5401 1744 3297 7770 3791 7368 976 1139 1104 2137 511 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: label_ids: [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 19 19 19 19 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
...
```
@@ -1,36 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import mindspore.dataset as ds

def create_dataset(data_file):
    """create MindDataset"""
    num_readers = 4
    data_set = ds.MindDataset(dataset_file=data_file, num_parallel_workers=num_readers, shuffle=True)
    index = 0
    for item in data_set.create_dict_iterator():
        # print("example {}: {}".format(index, item))
        print("example {}: input_ids: {}".format(index, item['input_ids']))
        print("example {}: input_mask: {}".format(index, item['input_mask']))
        print("example {}: segment_ids: {}".format(index, item['segment_ids']))
        print("example {}: label_ids: {}".format(index, item['label_ids']))
        index += 1
        if index % 1000 == 0:
            print("read rows: {}".format(index))
    print("total rows: {}".format(index))

if __name__ == '__main__':
    create_dataset('output/train.mindrecord')
    create_dataset('output/dev.mindrecord')
@@ -1 +0,0 @@
cluener_public
@@ -1 +0,0 @@
## output dir
@@ -1,40 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

rm -f output/train.mindrecord*
rm -f output/dev.mindrecord*

if [ ! -d "../../../third_party/to_mindrecord/CLUERNER2020" ]; then
    echo "The patch base dir ../../../third_party/to_mindrecord/CLUERNER2020 does not exist."
    exit 1
fi

if [ ! -f "../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch" ]; then
    echo "The patch file ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch does not exist."
    exit 1
fi

# patch for data_processor_seq.py
patch -p0 -d ../../../third_party/to_mindrecord/CLUERNER2020/ -o data_processor_seq_patched.py < ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch
if [ $? -ne 0 ]; then
    echo "Patch ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq.py failed"
    exit 1
fi

# use patched script
python ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq_patched.py \
    --vocab_file=../../../third_party/to_mindrecord/CLUERNER2020/vocab.txt \
    --label2id_file=../../../third_party/to_mindrecord/CLUERNER2020/label2id.json
@@ -1 +0,0 @@
## The input dataset
@@ -1,173 +0,0 @@
# Guideline to Convert enwiki Training Data to MindRecord for Bert Pre-Training

<!-- TOC -->

- [What does the example do](#what-does-the-example-do)
- [How to use the example to process enwiki](#how-to-use-the-example-to-process-enwiki)
    - [Download enwiki training data](#download-enwiki-training-data)
    - [Process the enwiki](#process-the-enwiki)
    - [Generate MindRecord](#generate-mindrecord)
    - [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)

<!-- /TOC -->

## What does the example do

This example converts the [enwiki](https://dumps.wikimedia.org/enwiki) training data into MindRecord files, which are then used for Bert network pre-training.

1. run.sh: entry script that generates the MindRecord files.
2. run_read.sh: entry script that creates a MindDataset from the MindRecord files.
    - create_dataset.py: uses MindDataset to read the MindRecord files and build the dataset.

## How to use the example to process enwiki

Download the enwiki data, process it, convert it to MindRecord, and use MindDataset to read the MindRecord files back.

### Download enwiki training data

> [enwiki dataset download address](https://dumps.wikimedia.org/enwiki) **-> 20200501 -> enwiki-20200501-pages-articles-multistream.xml.bz2**

### Process the enwiki

1. Please follow the steps in [process enwiki](https://github.com/mlperf/training/tree/master/language_model/tensorflow/bert).
    - All rights to this processing step belong to the linked website.

### Generate MindRecord

1. Run the run.sh script.
```
bash run.sh input_dir output_dir vocab_file
```
- input_dir: the directory that contains files like 'part-00251-of-00500'.
- output_dir: the directory that will store the output MindRecord files.
- vocab_file: the vocabulary file, which you can download from another open-source project.

2. The output looks like this:
```
...
Begin preprocess Wed Jun 10 09:21:23 CST 2020
Begin preprocess input file: /mnt/data/results/part-00000-of-00500
Begin output file: part-00000-of-00500.mindrecord
Total task: 510, processing: 1
Begin preprocess input file: /mnt/data/results/part-00001-of-00500
Begin output file: part-00001-of-00500.mindrecord
Total task: 510, processing: 2
Begin preprocess input file: /mnt/data/results/part-00002-of-00500
Begin output file: part-00002-of-00500.mindrecord
Total task: 510, processing: 3
Begin preprocess input file: /mnt/data/results/part-00003-of-00500
Begin output file: part-00003-of-00500.mindrecord
Total task: 510, processing: 4
Begin preprocess input file: /mnt/data/results/part-00004-of-00500
Begin output file: part-00004-of-00500.mindrecord
Total task: 510, processing: 4
...
```

3. The following files are generated:
```bash
$ ls {your_output_dir}/
part-00000-of-00500.mindrecord part-00000-of-00500.mindrecord.db part-00001-of-00500.mindrecord part-00001-of-00500.mindrecord.db part-00002-of-00500.mindrecord part-00002-of-00500.mindrecord.db ...
```

### Create MindDataset By MindRecord

1. Run the run_read.sh script.
```bash
bash run_read.sh input_dir
```
- input_dir: the directory that contains the MindRecord files.

2. The output looks like this:
```
...
example 633: input_ids: [ 101 2043 19781 4305 2140 4520 2041 1010 103 2034 2455 2002
7879 2003 1996 2455 1997 103 26378 4160 1012 102 7291 2001
1996 103 1011 2343 1997 6327 1010 3423 1998 103 4262 2005
1996 2118 1997 2329 3996 103 102 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0]
example 633: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 633: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 633: masked_lm_positions: [ 8 17 20 25 33 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
example 633: masked_lm_ids: [ 1996 16137 1012 3580 2451 1012 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
example 633: masked_lm_weights: [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0.]
example 633: next_sentence_labels: [1]
...
```