@@ -0,0 +1,63 @@
# Prune

## Motivation

We want to support running inference, training, and checkpointing in one `ProgramDesc`. We implement
a `void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc`
and generates a pruned `ProgramDesc`.

## Challenge

Pruning needs to support both variables and operators being evaluation targets. Consider the following
situations:

```python
# Case 1: run the forward pass.
cost_np = session.run(target=cost)
# Case 2: run the backward pass.
opts_np, _ = session.run(target=[cost, opt])
# Case 3: run checkpointing.
_ = session.run(target=checkpoint)
```

## Solution

To support evaluation of operators, we add an `is_target` field to `OpDesc`.

```c++
message OpDesc {
  required string type = 3;
  repeated Var inputs = 1;
  repeated Var outputs = 2;
  repeated Attr attrs = 4;
  optional bool is_target = 5 [ default = false ];
};
```

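With this flag in place, marking an operator target (such as `cost` or `opt` above) amounts to setting `is_target` on the corresponding `OpDesc` before pruning. The sketch below assumes protobuf-generated accessors and a single-block program, and, purely for illustration, looks ops up by their output variable names; only `set_is_target` follows directly from the message above.

```c++
// Hedged sketch: mark every op that produces one of the requested targets,
// so that pruning keeps it and, transitively, everything it depends on.
void MarkTargetOps(const std::set<std::string>& target_names, ProgramDesc* program) {
  // Assumed single-block program layout; adjust to the real message definitions.
  for (OpDesc& op : *program->mutable_blocks(0)->mutable_ops()) {
    for (const Var& out : op.outputs()) {
      for (const std::string& arg : out.arguments()) {
        if (target_names.count(arg) != 0) {
          op.set_is_target(true);  // generated setter for the field above
        }
      }
    }
  }
}
```
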
To support evaluation of variables, we add a [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with the variable as the
`fetch_op`'s input. Then we also mark the `fetch_op` as a target, as sketched below.

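The exact message layouts of `ProgramDesc` and `Var` are not spelled out in this note, so the following is only a sketch of that step under some assumptions: a single-block program, protobuf-generated accessors, a `parameter`/`arguments` layout for `Var`, and the placeholder op type `"fetch"` with input slot `"X"`.

```c++
// Hedged sketch: turn a variable target into an operator target by appending
// a fetch_op that reads the variable and is itself marked as a target.
void AppendFetchTarget(const std::string& var_name, ProgramDesc* program) {
  OpDesc* fetch = program->mutable_blocks(0)->add_ops();  // assumed single-block layout
  fetch->set_type("fetch");            // placeholder op type name
  Var* input = fetch->add_inputs();
  input->set_parameter("X");           // placeholder input slot name
  input->add_arguments(var_name);      // the variable the user asked to evaluate
  fetch->set_is_target(true);          // pruning now treats this op as a target
}
```
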
### Algorithm

If an operator needs to be run, it must fall into one of the following cases:

1. It is the target.
2. Some other ops depend on it, meaning its output is some other op's input.

The first case can be checked by `op_desc.is_target()`. The second case can be implemented as

```c++
bool HasDependentVar(const OpDesc& op_desc, const std::set<std::string>& dependent_vars) {
  for (auto& var : op_desc.outputs()) {
    for (auto& argu : var.arguments()) {
      if (dependent_vars.count(argu) != 0) {
        return true;
      }
    }
  }
  return false;
}
```

Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc).

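The linked implementation is authoritative; the sketch below only illustrates the idea, reusing `HasDependentVar` and the same single-block assumption as above: walk the ops in reverse order, keep an op if it is a target or if an already-kept op consumes one of its outputs, record the kept op's inputs as new dependencies, and finally copy the surviving ops into the output program.

```c++
// Hedged sketch of the pruning pass (see the linked prune.cc for the real code).
void Prune(const ProgramDesc* input, ProgramDesc* output) {
  const auto& ops = input->blocks(0).ops();  // assumed single-block program
  std::set<std::string> dependent_vars;
  std::vector<bool> should_run(ops.size(), false);

  // Reverse pass: decide which ops must run.
  for (int i = ops.size() - 1; i >= 0; --i) {
    const OpDesc& op = ops.Get(i);
    if (op.is_target() || HasDependentVar(op, dependent_vars)) {
      // Everything this op reads must be produced by some earlier kept op.
      for (const auto& var : op.inputs()) {
        for (const auto& arg : var.arguments()) {
          dependent_vars.insert(arg);
        }
      }
      should_run[i] = true;
    }
  }

  // Forward pass: copy only the surviving ops into the pruned program.
  *output = *input;
  output->mutable_blocks(0)->clear_ops();
  for (int i = 0; i < ops.size(); ++i) {
    if (should_run[i]) {
      *output->mutable_blocks(0)->add_ops() = ops.Get(i);
    }
  }
}
```
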
@@ -0,0 +1,100 @@
import gzip
import math

import paddle.v2 as paddle

embsize = 32
hiddensize = 256
N = 5


def wordemb(inlayer):
    wordemb = paddle.layer.embedding(
        input=inlayer,
        size=embsize,
        param_attr=paddle.attr.Param(
            name="_proj",
            initial_std=0.001,
            learning_rate=1,
            l2_rate=0,
            sparse_update=True))
    return wordemb


def main():
    # for local training
    cluster_train = False

    if not cluster_train:
        paddle.init(use_gpu=False, trainer_count=1)
    else:
        paddle.init(
            use_gpu=False,
            trainer_count=2,
            port=7164,
            ports_num=1,
            ports_num_for_sparse=1,
            num_gradient_servers=1)
    word_dict = paddle.dataset.imikolov.build_dict()
    dict_size = len(word_dict)
    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(input=contextemb,
                              size=hiddensize,
                              act=paddle.activation.Sigmoid(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
                              bias_attr=paddle.attr.Param(learning_rate=2),
                              param_attr=paddle.attr.Param(
                                  initial_std=1. / math.sqrt(embsize * 8),
                                  learning_rate=1))
    predictword = paddle.layer.fc(input=hidden1,
                                  size=dict_size,
                                  bias_attr=paddle.attr.Param(learning_rate=2),
                                  act=paddle.activation.Softmax())

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
                               'w') as f:
                    trainer.save_parameter_to_tar(f)
                result = trainer.test(
                    paddle.batch(
                        paddle.dataset.imikolov.test(word_dict, N), 32))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)

    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost,
                                 parameters,
                                 adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
        num_passes=30,
        event_handler=event_handler)


if __name__ == '__main__':
    main()
@@ -0,0 +1,123 @@
import math
import os
import paddle.v2 as paddle
import pickle

embsize = 32
hiddensize = 256
N = 5
cluster_train_file = "./train_data_dir/train/train.txt"
cluster_test_file = "./test_data_dir/test/test.txt"
node_id = os.getenv("OMPI_COMM_WORLD_RANK")
if not node_id:
    raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK")


def wordemb(inlayer):
    wordemb = paddle.layer.embedding(
        input=inlayer,
        size=embsize,
        param_attr=paddle.attr.Param(
            name="_proj",
            initial_std=0.001,
            learning_rate=1,
            l2_rate=0,
            sparse_update=True))
    return wordemb


def cluster_reader_cluster(filename, node_id):
    def cluster_reader():
        with open("-".join([filename, "%05d" % int(node_id)]), "r") as f:
            for l in f:
                csv_data = [int(cell) for cell in l.split(",")]
                yield tuple(csv_data)

    return cluster_reader


def main():
    # get arguments from env

    # for local training
    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False")

    if not cluster_train:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
    else:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
            ports_num_for_sparse=int(
                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
            num_gradient_servers=int(
                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
    fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r")
    word_dict = pickle.load(fn)
    fn.close()
    dict_size = len(word_dict)
    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(input=contextemb,
                              size=hiddensize,
                              act=paddle.activation.Sigmoid(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
                              bias_attr=paddle.attr.Param(learning_rate=2),
                              param_attr=paddle.attr.Param(
                                  initial_std=1. / math.sqrt(embsize * 8),
                                  learning_rate=1))
    predictword = paddle.layer.fc(input=hidden1,
                                  size=dict_size,
                                  bias_attr=paddle.attr.Param(learning_rate=2),
                                  act=paddle.activation.Softmax())

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                result = trainer.test(
                    paddle.batch(
                        cluster_reader_cluster(cluster_test_file, node_id), 32))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost,
                                 parameters,
                                 adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
        num_passes=30,
        event_handler=event_handler)


if __name__ == '__main__':
    main()
@@ -0,0 +1,41 @@
import paddle.v2 as paddle
import tarfile
import os
import pickle

SPLIT_COUNT = 3
N = 5


def file_len(fd):
    for i, l in enumerate(fd):
        pass
    return i + 1


def split_from_reader_by_line(filename, reader, split_count):
    fn = open(filename, "w")
    for batch_id, batch_data in enumerate(reader()):
        batch_data_str = [str(d) for d in batch_data]
        fn.write(",".join(batch_data_str))
        fn.write("\n")
    fn.close()

    fn = open(filename, "r")
    total_line_count = file_len(fn)
    fn.close()
    per_file_lines = total_line_count / split_count + 1
    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
    os.system(cmd)


word_dict = paddle.dataset.imikolov.build_dict()
with open("word_dict.pickle", "w") as dict_f:
    pickle.dump(word_dict, dict_f)

split_from_reader_by_line("train.txt",
                          paddle.dataset.imikolov.train(word_dict, N),
                          SPLIT_COUNT)
split_from_reader_by_line("test.txt",
                          paddle.dataset.imikolov.test(word_dict, N),
                          SPLIT_COUNT)