python tokenize_corpus.py --corpus_folder /{path}/corpus --output_folder /{path}/tokenized_corpus --tokenizer nltk --pool_size 16

cd tokenized_corpus/

# build bpe codes
cat *.txt | subword-nmt learn-bpe -s 46000 -o all.bpe.codes

# build bpe dict
subword-nmt get-vocab -i tokenized.txt -o vocab_en.dict.bin

# apply bpe encoding
python apply_bpe_encoding.py --codes ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/all.bpe.codes \
    --src_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/ \
    --output_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/bpe \
    --vocab_path ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/vocab_en.dict.bin \
    --processes 32
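
Per worker, this step applies the codes learned by learn-bpe above. A minimal sketch of that encoding step using subword-nmt's public Python API (what apply_bpe_encoding.py does internally is an assumption; file names are illustrative):

import codecs
from subword_nmt.apply_bpe import BPE

# load the merge operations learned by learn-bpe
with codecs.open("all.bpe.codes", encoding="utf-8") as codes:
    bpe = BPE(codes)

# segment one tokenized file into subword units
with codecs.open("tokenized.txt", encoding="utf-8") as src, \
     codecs.open("tokenized.bpe.txt", "w", encoding="utf-8") as out:
    for line in src:
        out.write(bpe.process_line(line))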

# build dataset news crawl
python news_crawl.py --src_folder ./news_crawl \
    --dict_folder ./news_crawl \
    --existed_vocab ./tokenized_corpus/vocab_en.dict.bin \
    --mask_ratio 0.5 \
    --output_folder ./news_crawl/dataset/tf_small_pretrain \
    --max_len 128 \
    --processes 32 \
    --ngram 2
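
This builds the MASS-style pre-training set: each sentence gets a contiguous span masked out (--mask_ratio 0.5 masks half the tokens) and the decoder is trained to reconstruct that span. A toy sketch of the corruption, assuming the standard MASS scheme (the script's exact span sampling and the mask id are assumptions; --ngram relates to ProphetNet's n-gram prediction and is not shown here):

import random

MASK_ID = 0  # placeholder; the real vocabulary's mask id is an assumption

def mass_mask(tokens, mask_ratio=0.5):
    """Mask one contiguous span covering mask_ratio of the sentence.

    The encoder sees `masked`; the decoder is trained to emit `span`.
    """
    span_len = max(1, int(len(tokens) * mask_ratio))
    start = random.randrange(0, len(tokens) - span_len + 1)
    masked = tokens[:start] + [MASK_ID] * span_len + tokens[start + span_len:]
    span = tokens[start:start + span_len]
    return masked, span

masked, span = mass_mask([1996, 4120, 2003, 2630, 1012, 102])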

# build dataset cnndm
python cnn_dm.py --test_src ./cnndm_data_prophetnet/prophetnet_tokenized/test.src.txt \
    --test_ref ./cnndm_data_prophetnet/prophetnet_tokenized/test.tgt.txt \
    --existed_vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin \
    --noise_prob 0.0 \
    --output_folder ./cnndm_data_prophetnet/dataset_hugging_face_tokenized/ \
    --max_len 512

# train
bash run_gpu.sh --task t --device_num 1 --device_id 3 --config ./config/config.json

# inference
bash run_gpu.sh --task i \
    --device_num 1 \
    --device_id 3 \
    --config ./config/test.json \
    --output output \
    --metric rouge \
    --vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin
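
--metric rouge makes the inference run score generated summaries against the references. For a quick offline sanity check of a single pair, the third-party rouge package (pip install rouge) gives comparable numbers; whether run_gpu.sh uses this exact scorer is an assumption:

from rouge import Rouge

hypothesis = "the navy unveiled a new submarine on state television"
reference = "state television showed the unveiling of the navy's new submarine"

# get_scores returns rouge-1 / rouge-2 / rouge-l, each with f/p/r fields
scores = Rouge().get_scores(hypothesis, reference)
print(scores[0]["rouge-1"]["f"], scores[0]["rouge-l"]["f"])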

# pytorch model structure
NgramTransformerProphetModel(
  (encoder): TransformerEncoder(
    (embed_tokens): Embedding(30522, 512, padding_idx=0)
    (embed_positions): LearnedPositionalEmbedding(513, 512, padding_idx=0)
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): NgramTransformerDecoder(
    (embed_tokens): Embedding(30522, 512, padding_idx=0)
    (embed_positions): LearnedPositionalEmbedding(514, 512, padding_idx=0)
    (ngram_input_embed): Embedding(2, 512)
    (layers): ModuleList(
      (0): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
)
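
The dump above is PyTorch's standard module repr: print(model) on any nn.Module recursively lists submodules with their constructor arguments. A tiny self-contained reproduction:

import torch.nn as nn

# printing a module yields an indented tree like the dump above
block = nn.Sequential(
    nn.Embedding(30522, 512, padding_idx=0),
    nn.Linear(512, 2048, bias=True),
    nn.Linear(2048, 512, bias=True),
    nn.LayerNorm((512,), eps=1e-05),
)
print(block)
print(sum(p.numel() for p in block.parameters()))  # parameter count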

data example:

src_tokens:
tensor([[ 1996, 11555, 18172, 7042, 2055, 1037, 18147, 5913, 3756, 6982,
        1999, 1996, 4120, 1012, 2007, 1996, 4022, 2000, 2022, 3621,
        2062, 4795, 1010, 2021, 2074, 2004, 26102, 1010, 1996, 7726,
        3212, 2038, 2042, 27696, 1996, 6745, 2804, 2000, 2049, 4170,
        1011, 1037, 8235, 4408, 28653, 2630, 6982, 1012, 11216, 1997,
        1996, 27143, 1011, 2550, 21905, 2442, 2031, 2245, 2008, 1996,
        13576, 8703, 2052, 2191, 1996, 7477, 12586, 1999, 2007, 1996,
        2784, 5380, 1997, 1996, 2152, 11915, 1012, 17186, 2091, 2005,
        2678, 1012, 3239, 1011, 9105, 1024, 7726, 3212, 9058, 2020,
        4760, 2125, 2037, 4408, 28653, 12622, 2006, 2110, 2547, 1012,
        18783, 1024, 7726, 3212, 3738, 3233, 2006, 2327, 1997, 1996,
        8254, 2050, 1021, 6982, 2328, 27143, 1012, 2021, 2009, 1005,
        1055, 2524, 2000, 2903, 2008, 1996, 4099, 2180, 1005, 1056,
        2156, 2023, 2028, 2746, 2007, 1996, 6120, 2437, 2009, 3233,
        2041, 2066, 1037, 14699, 7639, 2114, 1996, 2300, 1005, 1055,
        3302, 1012, 1996, 3212, 2001, 4760, 2125, 1996, 3239, 1011,
        9105, 4325, 1010, 2029, 2003, 2105, 1996, 2946, 1997, 1037,
        15437, 1010, 2006, 4238, 2110, 2547, 7483, 1012, 3212, 4584,
        1010, 2738, 4603, 2135, 5102, 1999, 5810, 2601, 11408, 4102,
        2000, 2037, 28190, 2911, 1010, 3427, 2004, 1996, 8254, 2050,
        1011, 1021, 1010, 6055, 2007, 3424, 1011, 2911, 10815, 1010,
        2001, 3390, 2012, 24112, 2099, 17532, 1010, 2379, 1996, 6143,
        11195, 1997, 7570, 10867, 17040, 1012, 2048, 2047, 7726, 1011,
        2328, 1043, 16102, 4313, 4942, 2015, 1998, 2048, 13671, 25215,
        11890, 27528, 2102, 2020, 2036, 5359, 2000, 1996, 3212, 1012,
        8235, 2630, 1024, 4238, 1005, 1055, 4397, 3390, 1043, 16102,
        4313, 6982, 5829, 1999, 2392, 1997, 1037, 4049, 1999, 1996,
        2670, 3417, 1997, 24112, 2099, 17532, 1999, 1996, 4723, 6084,
        1012, 19194, 1024, 1996, 12622, 3233, 2041, 2066, 1037, 14699,
        1011, 7639, 2114, 1996, 3302, 1997, 1996, 2712, 1012, 3212,
        2708, 4373, 5902, 5292, 28065, 14511, 4430, 2360, 13380, 2072,
        2001, 9339, 2006, 7726, 2547, 2004, 3038, 2008, 1996, 3842,
        2442, 10295, 1996, 1005, 14751, 2974, 1998, 2327, 1011, 3694,
        4128, 2000, 4047, 2049, 6645, 1012, 1005, 1043, 16102, 4313,
        2465, 12622, 2064, 2543, 10815, 1998, 18544, 2012, 1996, 2168,
        2051, 1010, 1998, 2064, 5452, 1999, 1996, 4723, 6084, 1005,
        1055, 8467, 5380, 1012, 4238, 2038, 4912, 2000, 12200, 2049,
        2250, 3639, 1998, 3987, 9859, 1010, 3038, 2151, 2825, 2925,
        4491, 2006, 2009, 2052, 2272, 2013, 1996, 2250, 1998, 2712,
        1012, 1996, 2406, 2085, 4447, 2000, 2022, 1005, 2969, 7182,
        1005, 1999, 3408, 1997, 17731, 3941, 2000, 3113, 2049, 2510,
        3791, 1012, 14430, 1024, 1996, 7726, 6982, 1005, 1055, 2453,
        2022, 2062, 9252, 2084, 1996, 11555, 1005, 21864, 15952, 3756,
        6982, 1010, 15885, 1010, 2021, 2027, 2024, 8053, 14224, 11401,
        1012, 102]], device='cuda:0')

prev_output_tokens:
tensor([[ 102, 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719,
        1011, 1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102,
        4313, 4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528,
        2102, 1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442,
        10295, 1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645,
        1012]], device='cuda:0')

target_tokens:
tensor([[ 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719, 1011,
        1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102, 4313,
        4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528, 2102,
        1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442, 10295,
        1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645, 1012,
        102]], device='cuda:0')
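
Note that prev_output_tokens is target_tokens rotated right by one position: the trailing separator id (102) moves to the front, giving the teacher-forcing input for the decoder. A minimal sketch (the id values are taken from the dump above):

import torch

def shift_right(target: torch.Tensor) -> torch.Tensor:
    # move the last token (the separator, 102 above) to the front
    return torch.cat([target[:, -1:], target[:, :-1]], dim=1)

target = torch.tensor([[7726, 2110, 2547, 1012, 102]])
prev_output = shift_right(target)  # tensor([[ 102, 7726, 2110, 2547, 1012]])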