@@ -0,0 +1,63 @@
# Prune

## Motivation

We want to support running inference, training, and checkpointing in one `ProgramDesc`. We implement a
`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc` as
input and generates a pruned `ProgramDesc` as output.

## Challenge

Pruning needs to support both variables and operators as evaluation targets. Consider the following
situations:

```python
# Case 1: run the forward pass.
cost_np = session.run(target=cost)
# Case 2: run the backward pass.
opts_np, _ = session.run(target=[cost, opt])
# Case 3: run checkpointing.
_ = session.run(target=checkpoint)
```

## Solution

To support evaluation of operators, we add an `is_target` field to `OpDesc`:

```c++
message OpDesc {
  required string type = 3;
  repeated Var inputs = 1;
  repeated Var outputs = 2;
  repeated Attr attrs = 4;
  optional bool is_target = 5 [ default = false ];
};
```

To support evaluation of variables, we add a [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
For each variable in `target`, we insert a `fetch_op` into the `ProgramDesc` with that variable as the
`fetch_op`'s input. Then we also mark the `fetch_op` itself as a target, as sketched below.
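The following is a minimal sketch of that insertion using protobuf's generated C++ accessors. It assumes the ops live in a `repeated BlockDesc blocks` field (block 0) and that `Var` carries a `parameter` name plus an `arguments` list, as implied by `HasDependentVar` below; `AppendFetchOp`, the `"X"` slot name, and the `"fetch"` type string are illustrative, not the actual Paddle helpers.

```c++
// A minimal sketch, not the actual Paddle helper: append a fetch_op for
// `var_name` to the program and mark the new op as a target.
void AppendFetchOp(ProgramDesc* program, const std::string& var_name) {
  // Assumes a `repeated BlockDesc blocks` field with the ops in block 0.
  auto* fetch = program->mutable_blocks(0)->add_ops();
  fetch->set_type("fetch");
  auto* input = fetch->add_inputs();
  input->set_parameter("X");       // input slot name is an assumption
  input->add_arguments(var_name);  // the variable to fetch
  fetch->set_is_target(true);      // the fetch_op itself becomes a target
}
```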
### Algorithm

If an operator needs to be run, it must fall into one of the following cases:

1. It is a target.
2. Some other op depends on it, i.e., its output is another op's input.

The first case can be checked with `op_desc.is_target()`. The second case can be implemented as:

```c++
bool HasDependentVar(const OpDesc& op_desc, const std::set<std::string>& dependent_vars) {
  for (auto& var : op_desc.outputs()) {
    for (auto& argu : var.arguments()) {
      if (dependent_vars.count(argu) != 0) {
        return true;
      }
    }
  }
  return false;
}
```

Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc); a simplified sketch follows.
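A simplified sketch of that algorithm, under the same single-block assumption as above: scan the ops in reverse order, keep an op if it is a target or if a kept op consumes one of its outputs, and propagate dependencies through each kept op's inputs.

```c++
// Sketch only: assumes the protobuf-generated accessors and that all ops
// live in blocks(0), mirroring the linked prune.cc.
void Prune(const ProgramDesc* input, ProgramDesc* output) {
  const auto& ops = input->blocks(0).ops();
  std::set<std::string> dependent_vars;
  std::vector<bool> should_run(ops.size(), false);
  // Scan backwards so an op is visited after every op that consumes its
  // outputs.
  for (int i = ops.size() - 1; i >= 0; --i) {
    const auto& op_desc = ops.Get(i);
    if (op_desc.is_target() || HasDependentVar(op_desc, dependent_vars)) {
      should_run[i] = true;
      // Each input of a kept op becomes a dependency of earlier ops.
      for (const auto& var : op_desc.inputs()) {
        for (const auto& argu : var.arguments()) {
          dependent_vars.insert(argu);
        }
      }
    }
  }
  // Copy the program, keeping only the ops marked above.
  *output = *input;
  auto* out_ops = output->mutable_blocks(0)->mutable_ops();
  out_ops->Clear();
  for (int i = 0; i < ops.size(); ++i) {
    if (should_run[i]) {
      *out_ops->Add() = ops.Get(i);
    }
  }
}
```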
@@ -0,0 +1,100 @@
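# Word2vec-style 5-gram neural language model on the imikolov dataset,
# trained locally with PaddlePaddle v2 (set cluster_train = True for the
# two-trainer configuration below).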
import gzip
import math

import paddle.v2 as paddle

embsize = 32
hiddensize = 256
N = 5


def wordemb(inlayer):
    wordemb = paddle.layer.embedding(
        input=inlayer,
        size=embsize,
        param_attr=paddle.attr.Param(
            name="_proj",
            initial_std=0.001,
            learning_rate=1,
            l2_rate=0,
            sparse_update=True))
    return wordemb


def main():
    # for local training
    cluster_train = False

    if not cluster_train:
        paddle.init(use_gpu=False, trainer_count=1)
    else:
        paddle.init(
            use_gpu=False,
            trainer_count=2,
            port=7164,
            ports_num=1,
            ports_num_for_sparse=1,
            num_gradient_servers=1)
    word_dict = paddle.dataset.imikolov.build_dict()
    dict_size = len(word_dict)
    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(input=contextemb,
                              size=hiddensize,
                              act=paddle.activation.Sigmoid(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
                              bias_attr=paddle.attr.Param(learning_rate=2),
                              param_attr=paddle.attr.Param(
                                  initial_std=1. / math.sqrt(embsize * 8),
                                  learning_rate=1))
    predictword = paddle.layer.fc(input=hidden1,
                                  size=dict_size,
                                  bias_attr=paddle.attr.Param(learning_rate=2),
                                  act=paddle.activation.Softmax())

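    # Every 100 batches: snapshot parameters to a gzipped tar archive and
    # report test-set metrics.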
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
                               'w') as f:
                    trainer.save_parameter_to_tar(f)
                result = trainer.test(
                    paddle.batch(
                        paddle.dataset.imikolov.test(word_dict, N), 32))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)

    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost,
                                 parameters,
                                 adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
        num_passes=30,
        event_handler=event_handler)


if __name__ == '__main__':
    main()
@@ -0,0 +1,123 @@
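# Cluster-aware variant of the word2vec trainer: each MPI rank reads its own
# shard of the pre-split imikolov data, and run-time settings come from
# PADDLE_INIT_* environment variables.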
import math
import os
import paddle.v2 as paddle
import pickle

embsize = 32
hiddensize = 256
N = 5
cluster_train_file = "./train_data_dir/train/train.txt"
cluster_test_file = "./test_data_dir/test/test.txt"
node_id = os.getenv("OMPI_COMM_WORLD_RANK")
if not node_id:
    raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK")


def wordemb(inlayer):
    wordemb = paddle.layer.embedding(
        input=inlayer,
        size=embsize,
        param_attr=paddle.attr.Param(
            name="_proj",
            initial_std=0.001,
            learning_rate=1,
            l2_rate=0,
            sparse_update=True))
    return wordemb


def cluster_reader_cluster(filename, node_id):
    def cluster_reader():
        # Each node reads only its own shard, e.g. train.txt-00000.
        with open("-".join([filename, "%05d" % int(node_id)]), "r") as f:
            for l in f:
                csv_data = [int(cell) for cell in l.split(",")]
                yield tuple(csv_data)

    return cluster_reader


def main():
    # get run-time arguments from the environment; for local training,
    # leave PADDLE_CLUSTER_TRAIN unset or false
    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
    # parse the flag: the raw environment string would always be truthy
    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") in TRUTH

    if not cluster_train:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
    else:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
            ports_num_for_sparse=int(
                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
            num_gradient_servers=int(
                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
    fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r")
    word_dict = pickle.load(fn)
    fn.close()
    dict_size = len(word_dict)
    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(input=contextemb,
                              size=hiddensize,
                              act=paddle.activation.Sigmoid(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
                              bias_attr=paddle.attr.Param(learning_rate=2),
                              param_attr=paddle.attr.Param(
                                  initial_std=1. / math.sqrt(embsize * 8),
                                  learning_rate=1))
    predictword = paddle.layer.fc(input=hidden1,
                                  size=dict_size,
                                  bias_attr=paddle.attr.Param(learning_rate=2),
                                  act=paddle.activation.Softmax())

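    # Every 100 batches: report metrics on this node's test shard.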
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                result = trainer.test(
                    paddle.batch(
                        cluster_reader_cluster(cluster_test_file, node_id), 32))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost,
                                 parameters,
                                 adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
        num_passes=30,
        event_handler=event_handler)


if __name__ == '__main__':
    main()
@@ -0,0 +1,41 @@
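# Data preparation: build the imikolov word dict, pickle it, and split the
# train/test sets into SPLIT_COUNT numbered shards for the cluster trainers.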
import paddle.v2 as paddle
import os
import pickle

SPLIT_COUNT = 3
N = 5


def file_len(fd):
    # Count lines; returns 0 for an empty file.
    i = -1
    for i, l in enumerate(fd):
        pass
    return i + 1

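# Dump every sample from `reader` as one CSV line, then shard the file into
# split_count pieces (filename-00000, filename-00001, ...) via POSIX `split`.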
def split_from_reader_by_line(filename, reader, split_count):
    fn = open(filename, "w")
    for batch_id, batch_data in enumerate(reader()):
        batch_data_str = [str(d) for d in batch_data]
        fn.write(",".join(batch_data_str))
        fn.write("\n")
    fn.close()

    fn = open(filename, "r")
    total_line_count = file_len(fn)
    fn.close()
    per_file_lines = total_line_count / split_count + 1
    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
    os.system(cmd)


word_dict = paddle.dataset.imikolov.build_dict()
with open("word_dict.pickle", "w") as dict_f:
    pickle.dump(word_dict, dict_f)

split_from_reader_by_line("train.txt",
                          paddle.dataset.imikolov.train(word_dict, N),
                          SPLIT_COUNT)
split_from_reader_by_line("test.txt",
                          paddle.dataset.imikolov.test(word_dict, N),
                          SPLIT_COUNT)