You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
209 lines
12 KiB
209 lines
12 KiB
python --corpus_folder /{path}/corpus --output_folder /{path}/tokenized_corpus --tokenizer nltk --pool_size 16
cd tokenized_corpus/
# build bpe codes
cat *.txt | subword-nmt learn-bpe -s 46000 -o all.bpe.codes
# build bpe dict
subword-nmt get-vocab -i tokenized.txt -o vocab_en.dict.bin
# apply bpe encoding
python --codes ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/ \
--src_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/ \
--output_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/bpe \
--vocab_path ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/vocab_en.dict.bin \
--processes 32
# build dataset news crawl
python --src_folder ./news_crawl \
--dict_folder ./news_crawl \
--existed_vocab ./tokenized_corpus/vocab_en.dict.bin \
--mask_ratio 0.5 \
--output_folder ./news_crawl/dataset/tf_small_pretrain \
--max_len 128 \
--processes 32 \
--ngram 2
# build dataset cnndm
python --test_src ./cnndm_data_prophetnet/prophetnet_tokenized/test.src.txt --test_ref ./cnndm_data_prophetnet/prophetnet_tokenized/test.tgt.txt --existed_vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin --noise_prob 0.0 --output_folder ./cnndm_data_prophetnet/dataset_hugging_face_tokenized/ --max_len 512
# train
bash --task t --device_num 1 --device_id 3 --config ./config/config.json
# inference
bash --task i \
--device_num 1 \
--device_id 3 \
--config ./config/test.json \
--output output \
--metric rouge \
--vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin
# pytorch model structure
(encoder): TransformerEncoder(
(embed_tokens): Embedding(30522, 512, padding_idx=0)
(embed_positions): LearnedPositionalEmbedding(513, 512, padding_idx=0)
(layers): ModuleList(
(0): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(k_proj): Linear(in_features=512, out_features=512, bias=True)
(v_proj): Linear(in_features=512, out_features=512, bias=True)
(q_proj): Linear(in_features=512, out_features=512, bias=True)
(out_proj): Linear(in_features=512, out_features=512, bias=True)
(self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fc1): Linear(in_features=512, out_features=2048, bias=True)
(fc2): Linear(in_features=2048, out_features=512, bias=True)
(final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(1): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(k_proj): Linear(in_features=512, out_features=512, bias=True)
(v_proj): Linear(in_features=512, out_features=512, bias=True)
(q_proj): Linear(in_features=512, out_features=512, bias=True)
(out_proj): Linear(in_features=512, out_features=512, bias=True)
(self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fc1): Linear(in_features=512, out_features=2048, bias=True)
(fc2): Linear(in_features=2048, out_features=512, bias=True)
(final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(2): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(k_proj): Linear(in_features=512, out_features=512, bias=True)
(v_proj): Linear(in_features=512, out_features=512, bias=True)
(q_proj): Linear(in_features=512, out_features=512, bias=True)
(out_proj): Linear(in_features=512, out_features=512, bias=True)
(self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fc1): Linear(in_features=512, out_features=2048, bias=True)
(fc2): Linear(in_features=2048, out_features=512, bias=True)
(final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(decoder): NgramTransformerDecoder(
(embed_tokens): Embedding(30522, 512, padding_idx=0)
(embed_positions): LearnedPositionalEmbedding(514, 512, padding_idx=0)
(ngram_input_embed): Embedding(2, 512)
(layers): ModuleList(
(0): NgramTransformerDecoderLayer(
(ngram_self_attn): NgramMultiheadAttention(
(relative_linear): Linear(in_features=512, out_features=256, bias=True)
(out_proj): Linear(in_features=512, out_features=512, bias=True)
(self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(encoder_attn): MultiheadAttention(
(k_proj): Linear(in_features=512, out_features=512, bias=True)
(v_proj): Linear(in_features=512, out_features=512, bias=True)
(q_proj): Linear(in_features=512, out_features=512, bias=True)
(out_proj): Linear(in_features=512, out_features=512, bias=True)
(encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fc1): Linear(in_features=512, out_features=2048, bias=True)
(fc2): Linear(in_features=2048, out_features=512, bias=True)
(final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(1): NgramTransformerDecoderLayer(
(ngram_self_attn): NgramMultiheadAttention(
(relative_linear): Linear(in_features=512, out_features=256, bias=True)
(out_proj): Linear(in_features=512, out_features=512, bias=True)
(self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(encoder_attn): MultiheadAttention(
(k_proj): Linear(in_features=512, out_features=512, bias=True)
(v_proj): Linear(in_features=512, out_features=512, bias=True)
(q_proj): Linear(in_features=512, out_features=512, bias=True)
(out_proj): Linear(in_features=512, out_features=512, bias=True)
(encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fc1): Linear(in_features=512, out_features=2048, bias=True)
(fc2): Linear(in_features=2048, out_features=512, bias=True)
(final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(2): NgramTransformerDecoderLayer(
(ngram_self_attn): NgramMultiheadAttention(
(relative_linear): Linear(in_features=512, out_features=256, bias=True)
(out_proj): Linear(in_features=512, out_features=512, bias=True)
(self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(encoder_attn): MultiheadAttention(
(k_proj): Linear(in_features=512, out_features=512, bias=True)
(v_proj): Linear(in_features=512, out_features=512, bias=True)
(q_proj): Linear(in_features=512, out_features=512, bias=True)
(out_proj): Linear(in_features=512, out_features=512, bias=True)
(encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fc1): Linear(in_features=512, out_features=2048, bias=True)
(fc2): Linear(in_features=2048, out_features=512, bias=True)
(final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
data example:
tensor([[ 1996, 11555, 18172, 7042, 2055, 1037, 18147, 5913, 3756, 6982,
1999, 1996, 4120, 1012, 2007, 1996, 4022, 2000, 2022, 3621,
2062, 4795, 1010, 2021, 2074, 2004, 26102, 1010, 1996, 7726,
3212, 2038, 2042, 27696, 1996, 6745, 2804, 2000, 2049, 4170,
1011, 1037, 8235, 4408, 28653, 2630, 6982, 1012, 11216, 1997,
1996, 27143, 1011, 2550, 21905, 2442, 2031, 2245, 2008, 1996,
13576, 8703, 2052, 2191, 1996, 7477, 12586, 1999, 2007, 1996,
2784, 5380, 1997, 1996, 2152, 11915, 1012, 17186, 2091, 2005,
2678, 1012, 3239, 1011, 9105, 1024, 7726, 3212, 9058, 2020,
4760, 2125, 2037, 4408, 28653, 12622, 2006, 2110, 2547, 1012,
18783, 1024, 7726, 3212, 3738, 3233, 2006, 2327, 1997, 1996,
8254, 2050, 1021, 6982, 2328, 27143, 1012, 2021, 2009, 1005,
1055, 2524, 2000, 2903, 2008, 1996, 4099, 2180, 1005, 1056,
2156, 2023, 2028, 2746, 2007, 1996, 6120, 2437, 2009, 3233,
2041, 2066, 1037, 14699, 7639, 2114, 1996, 2300, 1005, 1055,
3302, 1012, 1996, 3212, 2001, 4760, 2125, 1996, 3239, 1011,
9105, 4325, 1010, 2029, 2003, 2105, 1996, 2946, 1997, 1037,
15437, 1010, 2006, 4238, 2110, 2547, 7483, 1012, 3212, 4584,
1010, 2738, 4603, 2135, 5102, 1999, 5810, 2601, 11408, 4102,
2000, 2037, 28190, 2911, 1010, 3427, 2004, 1996, 8254, 2050,
1011, 1021, 1010, 6055, 2007, 3424, 1011, 2911, 10815, 1010,
2001, 3390, 2012, 24112, 2099, 17532, 1010, 2379, 1996, 6143,
11195, 1997, 7570, 10867, 17040, 1012, 2048, 2047, 7726, 1011,
2328, 1043, 16102, 4313, 4942, 2015, 1998, 2048, 13671, 25215,
11890, 27528, 2102, 2020, 2036, 5359, 2000, 1996, 3212, 1012,
8235, 2630, 1024, 4238, 1005, 1055, 4397, 3390, 1043, 16102,
4313, 6982, 5829, 1999, 2392, 1997, 1037, 4049, 1999, 1996,
2670, 3417, 1997, 24112, 2099, 17532, 1999, 1996, 4723, 6084,
1012, 19194, 1024, 1996, 12622, 3233, 2041, 2066, 1037, 14699,
1011, 7639, 2114, 1996, 3302, 1997, 1996, 2712, 1012, 3212,
2708, 4373, 5902, 5292, 28065, 14511, 4430, 2360, 13380, 2072,
2001, 9339, 2006, 7726, 2547, 2004, 3038, 2008, 1996, 3842,
2442, 10295, 1996, 1005, 14751, 2974, 1998, 2327, 1011, 3694,
4128, 2000, 4047, 2049, 6645, 1012, 1005, 1043, 16102, 4313,
2465, 12622, 2064, 2543, 10815, 1998, 18544, 2012, 1996, 2168,
2051, 1010, 1998, 2064, 5452, 1999, 1996, 4723, 6084, 1005,
1055, 8467, 5380, 1012, 4238, 2038, 4912, 2000, 12200, 2049,
2250, 3639, 1998, 3987, 9859, 1010, 3038, 2151, 2825, 2925,
4491, 2006, 2009, 2052, 2272, 2013, 1996, 2250, 1998, 2712,
1012, 1996, 2406, 2085, 4447, 2000, 2022, 1005, 2969, 7182,
1005, 1999, 3408, 1997, 17731, 3941, 2000, 3113, 2049, 2510,
3791, 1012, 14430, 1024, 1996, 7726, 6982, 1005, 1055, 2453,
2022, 2062, 9252, 2084, 1996, 11555, 1005, 21864, 15952, 3756,
6982, 1010, 15885, 1010, 2021, 2027, 2024, 8053, 14224, 11401,
1012, 102]], device='cuda:0')
tensor([[ 102, 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719,
1011, 1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102,
4313, 4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528,
2102, 1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442,
10295, 1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645,
1012]], device='cuda:0')
tensor([[ 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719, 1011,
1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102, 4313,
4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528, 2102,
1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442, 10295,
1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645, 1012,
102]], device='cuda:0')