diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index efb4dcb2df..980a97a07c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,9 +22,11 @@ hooks: - id: clang-formater - repo: https://github.com/PaddlePaddle/pre-commit-golang - sha: 16398aeccf263adaf53b2495eed0406347d76281 + sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 hooks: - - id: go-fmt - types: [go] - - id: gometalinter - types: [go] + - id: go-fmt + types: + - go + - id: gometalinter + types: + - go diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 3e6cedbb0d..f7483f6be9 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - # for latest version, please get from official website - # URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz" - # URL_MD5 "1a47e78efe365a97de0c022d127607c3" - - # for no-ssl http support, please get from bazel's mirror - # URL "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz" - # URL_MD5 "4645c66075982da6fa0bcf6b20f3e8f7" - - # get from github mirror GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG "a46d2e7337c4656f00abe54a8115f6d76153a048" + GIT_TAG "master" PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 34fd348893..ef31c25203 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -153,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) +LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread) LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math) if(CMAKE_BUILD_TYPE STREQUAL "Debug") diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go index 287da69491..739c4c01e0 100644 --- a/go/cmd/master/master.go +++ b/go/cmd/master/master.go @@ -19,6 +19,8 @@ import ( "net" "net/http" "net/rpc" + "os" + "os/signal" "strconv" "strings" "time" @@ -68,6 +70,20 @@ func main() { store = &master.InMemStore{} } + shutdown := func() { + log.Infoln("shutting down gracefully") + err := store.Shutdown() + if err != nil { + log.Errorln(err) + } + } + + // Guaranteed to run even panic happens. + defer shutdown() + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax) if err != nil { log.Fatal(err) @@ -84,8 +100,12 @@ func main() { log.Fatal(err) } - err = http.Serve(l, nil) - if err != nil { - log.Fatal(err) - } + go func() { + err = http.Serve(l, nil) + if err != nil { + log.Fatal(err) + } + }() + + <-c } diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index aa81d0432b..f9cd8f87e8 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -18,6 +18,8 @@ import ( "net" "net/http" "net/rpc" + "os" + "os/signal" "strconv" "time" @@ -33,7 +35,8 @@ func main() { index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") - etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls") + dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout") + etcdTTL := flag.Int("etcd-ttl", 5, "etcd time to live in seconds") numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job") checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path") checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds") @@ -53,7 +56,7 @@ func main() { if *index >= 0 { idx = *index } else { - e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout) + e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *dialTimeout, *etcdTTL) idx, err = e.Register(*port) candy.Must(err) @@ -67,6 +70,20 @@ func main() { } } + shutdown := func() { + log.Infoln("shutting down gracefully") + sErr := e.Shutdown() + if sErr != nil { + log.Errorln(sErr) + } + } + + // Guaranteed to run even panic happens. + defer shutdown() + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp) candy.Must(err) @@ -77,7 +94,11 @@ func main() { l, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) candy.Must(err) - log.Infof("start pserver at port %d", *port) - err = http.Serve(l, nil) - candy.Must(err) + go func() { + log.Infof("start pserver at port %d", *port) + err = http.Serve(l, nil) + candy.Must(err) + }() + + <-c } diff --git a/go/glide.lock b/go/glide.lock index f71ae643d6..1f16abdf66 100644 --- a/go/glide.lock +++ b/go/glide.lock @@ -1,15 +1,105 @@ -hash: a8faea3a363468a88917ddeb3b1c9ea36886fb2c622acbad42604fa9cb4d3855 -updated: 2017-07-11T10:04:40.786745417+08:00 +hash: 2a1c0eca5c07a130e3d224f9821f96cfa37a39bf6bce141c855bbc57ef569f1c +updated: 2017-07-29T07:34:48.722757905+08:00 imports: +- name: github.com/beorn7/perks + version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 + subpackages: + - quantile +- name: github.com/boltdb/bolt + version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9 +- name: github.com/cockroachdb/cmux + version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92 - name: github.com/coreos/etcd - version: cb2a496c4ddd1c87a9f280e116649b599999ec79 + version: c31bec0f29facff13f7c3e3d948e55dd6689ed42 subpackages: + - alarm + - auth - auth/authpb + - client - clientv3 - clientv3/concurrency + - compactor + - discovery + - embed + - error + - etcdserver + - etcdserver/api + - etcdserver/api/v2http + - etcdserver/api/v2http/httptypes + - etcdserver/api/v3client + - etcdserver/api/v3election + - etcdserver/api/v3election/v3electionpb + - etcdserver/api/v3election/v3electionpb/gw + - etcdserver/api/v3lock + - etcdserver/api/v3lock/v3lockpb + - etcdserver/api/v3lock/v3lockpb/gw + - etcdserver/api/v3rpc - etcdserver/api/v3rpc/rpctypes + - etcdserver/auth - etcdserver/etcdserverpb + - etcdserver/etcdserverpb/gw + - etcdserver/membership + - etcdserver/stats + - lease + - lease/leasehttp + - lease/leasepb + - mvcc + - mvcc/backend - mvcc/mvccpb + - pkg/adt + - pkg/contention + - pkg/cors + - pkg/cpuutil + - pkg/crc + - pkg/debugutil + - pkg/fileutil + - pkg/httputil + - pkg/idutil + - pkg/ioutil + - pkg/logutil + - pkg/monotime + - pkg/netutil + - pkg/pathutil + - pkg/pbutil + - pkg/runtime + - pkg/schedule + - pkg/srv + - pkg/tlsutil + - pkg/transport + - pkg/types + - pkg/wait + - proxy/grpcproxy/adapter + - raft + - raft/raftpb + - rafthttp + - snap + - snap/snappb + - store + - version + - wal + - wal/walpb +- name: github.com/coreos/go-semver + version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6 + subpackages: + - semver +- name: github.com/coreos/go-systemd + version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6 + subpackages: + - daemon + - journal + - util +- name: github.com/coreos/pkg + version: 3ac0863d7acf3bc44daf49afef8919af12f704ef + subpackages: + - capnslog +- name: github.com/dgrijalva/jwt-go + version: d2709f9f1f31ebcda9651b03077758c1f3a0018c +- name: github.com/ghodss/yaml + version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7 +- name: github.com/gogo/protobuf + version: 909568be09de550ed094403c2bf8a261b5bb730a + subpackages: + - proto - name: github.com/golang/protobuf version: 4bd1920723d7b7c925de087aa32e2187708897f7 subpackages: @@ -17,14 +107,61 @@ imports: - proto - name: github.com/golang/snappy version: 553a641470496b2327abcac10b36396bd98e45c9 +- name: github.com/google/btree + version: 925471ac9e2131377a91e1595defec898166fe49 +- name: github.com/grpc-ecosystem/go-grpc-prometheus + version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0 +- name: github.com/grpc-ecosystem/grpc-gateway + version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676 + subpackages: + - runtime + - runtime/internal + - utilities +- name: github.com/jonboulle/clockwork + version: 2eee05ed794112d45db504eb05aa693efd2b8b09 +- name: github.com/matttproud/golang_protobuf_extensions + version: c12348ce28de40eed0136aa2b644d0ee0650e56c + subpackages: + - pbutil - name: github.com/namsral/flag version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04 - name: github.com/PaddlePaddle/recordio - version: edfb82af0739c84f241c87390ec5649c7b28c129 + version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81 +- name: github.com/prometheus/client_golang + version: c5b7fccd204277076155f10851dad72b76a49317 + subpackages: + - prometheus +- name: github.com/prometheus/client_model + version: 6f3806018612930941127f2a7c6c453ba2c527d2 + subpackages: + - go +- name: github.com/prometheus/common + version: 49fee292b27bfff7f354ee0f64e1bc4850462edf + subpackages: + - expfmt + - internal/bitbucket.org/ww/goautoneg + - model +- name: github.com/prometheus/procfs + version: a1dba9ce8baed984a2495b658c82687f8157b98f + subpackages: + - xfs - name: github.com/sirupsen/logrus - version: 7f976d3a76720c4c27af2ba716b85d2e0a7e38b1 + version: a3f95b5c423586578a4e099b11a46c2479628cac - name: github.com/topicai/candy version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc +- name: github.com/ugorji/go + version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74 + subpackages: + - codec +- name: github.com/xiang90/probing + version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2 +- name: golang.org/x/crypto + version: 1351f936d976c60a0a48d728281922cf63eafb8d + repo: https://github.com/golang/crypto.git + vcs: git + subpackages: + - bcrypt + - blowfish - name: golang.org/x/net version: c8c74377599bd978aee1cf3b9b63a8634051cec2 subpackages: @@ -36,11 +173,15 @@ imports: - lex/httplex - trace - name: golang.org/x/sys - version: abf9c25f54453410d0c6668e519582a9e1115027 + version: 0f826bdd13b500be0f1d4004938ad978fcc6031e + repo: https://github.com/golang/sys.git + vcs: git subpackages: - unix - name: golang.org/x/text - version: cfdf022e86b4ecfb646e1efbd7db175dd623a8fa + version: 836efe42bb4aa16aaa17b9c155d8813d336ed720 + repo: https://github.com/golang/text.git + vcs: git subpackages: - secure/bidirule - transform @@ -60,4 +201,23 @@ imports: - stats - tap - transport -testImports: [] +- name: gopkg.in/yaml.v2 + version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b +testImports: +- name: github.com/davecgh/go-spew + version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9 + subpackages: + - spew +- name: github.com/docker/docker + version: b6d164e6c46d8115b146e4c3ac93784e9ef8b49e + subpackages: + - pkg/ioutils + - pkg/longpath +- name: github.com/pmezard/go-difflib + version: d8ed2627bdf02c080bf22230dbb337003b7aba2d + subpackages: + - difflib +- name: github.com/stretchr/testify + version: 05e8a0eda380579888eb53c394909df027f06991 + subpackages: + - assert diff --git a/go/glide.yaml b/go/glide.yaml index ab472c7cda..bc23fa6ebf 100644 --- a/go/glide.yaml +++ b/go/glide.yaml @@ -6,8 +6,19 @@ import: subpackages: - clientv3 - clientv3/concurrency + - embed + - etcdserver - package: github.com/namsral/flag version: ^1.7.4-pre - package: github.com/sirupsen/logrus version: ^1.0.0 - package: github.com/topicai/candy +- package: golang.org/x/crypto + vcs: git + repo: https://github.com/golang/crypto.git +- package: golang.org/x/sys + vcs: git + repo: https://github.com/golang/sys.git +- package: golang.org/x/text + vcs: git + repo: https://github.com/golang/text.git diff --git a/go/master/c/client.go b/go/master/c/client.go index a2b18e4b47..b5759c30b1 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -18,7 +18,6 @@ package main #include #include #include - #define PADDLE_MASTER_OK 0 #define PADDLE_MASTER_ERROR -1 @@ -101,6 +100,12 @@ func paddle_release_master_client(client C.paddle_master_client) { remove(client) } +//export paddle_start_get_records +func paddle_start_get_records(client C.paddle_master_client, pass C.int) { + c := get(client) + c.StartGetRecords(int(pass)) +} + //export paddle_set_dataset func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int { c := get(client) @@ -121,15 +126,19 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int // paddle_next_record gets the nexts training record. // -// returns number of bytes of the records if success, -1 if failed. +// returns number of bytes of the records if success, -1 if failed, -2 if pass end. // //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) r, err := c.NextRecord() if err != nil { - // Error - // TODO: return the type of error? + // NOTE: use errors to indicate pass ends + if err.Error() == master.ErrAllTaskFailed.Error() || + err.Error() == master.ErrNoMoreAvailable.Error() || + err.Error() == master.ErrPassBefore.Error() { + return -2 + } *record = (*C.uchar)(nil) return -1 } diff --git a/go/master/client.go b/go/master/client.go index bbf3768d96..62801b9b7f 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -16,7 +16,6 @@ package master import ( "os" - "sync" "time" "github.com/PaddlePaddle/Paddle/go/connection" @@ -27,9 +26,9 @@ import ( // Client is the client of the master server. type Client struct { - conn *connection.Conn - ch chan record - initChOnce sync.Once + conn *connection.Conn + ch chan record + bufSize int } type record struct { @@ -46,11 +45,7 @@ func WithBuffer(bufSize int) func(*Client) error { if bufSize <= 0 { return nil } - - c.initChOnce.Do(func() { - c.ch = make(chan record, bufSize) - go c.getRecords() - }) + c.bufSize = bufSize return nil } } @@ -104,25 +99,41 @@ func NewClient(opts ...func(*Client) error) (*Client, error) { if err != nil { return nil, err } - } - + c.ch = make(chan record, c.bufSize) + // FIXME: connection is created asyncrosly in monitorMaster go routine, + // ensure the connection is ready for use before calling c.addClient. + time.Sleep(time.Second) return c, nil } -func (c *Client) getRecords() { +// StartGetRecords must be called at beginning of each pass +func (c *Client) StartGetRecords(passID int) { + go c.getRecords(passID) +} + +func (c *Client) getRecords(passID int) { for { - t, err := c.getTask() + t, err := c.getTask(passID) if err != nil { - log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err) - time.Sleep(3 * time.Second) - continue + if err.Error() == ErrPassBefore.Error() || + err.Error() == ErrNoMoreAvailable.Error() || + err.Error() == ErrAllTaskFailed.Error() { + c.ch <- record{nil, err} + break + } + if err.Error() == ErrPassAfter.Error() { + // wait util last pass finishes + time.Sleep(time.Second * 3) + continue + } + log.Errorf("getTask error: %s", err) } for _, chunk := range t.Chunks { - f, err := os.Open(chunk.Path) - if err != nil { - log.Errorln(err) + f, e := os.Open(chunk.Path) + if e != nil { + log.Errorln(e) continue } @@ -178,18 +189,21 @@ func (c *Client) monitorMaster(addrCh <-chan string) { } } -// SetDataset set dataset for the master server to dispatch. +// SetDataset sets dataset to dispatch for the master server. +// +// SetDataset can be call multiple times at one pass. But only the first call +// will be honored. // -// SetDataset can be call multiple times from different nodes. But -// only the first call will be honored. +// After all tasks are done, another call of SetDataset will start another pass. func (c *Client) SetDataset(globPaths []string) error { - return c.conn.Call("Service.SetDataset", globPaths, nil) + err := c.conn.Call("Service.SetDataset", globPaths, nil) + return err } // getTask gets a new task from the master server. -func (c *Client) getTask() (Task, error) { +func (c *Client) getTask(passID int) (Task, error) { var t Task - err := c.conn.Call("Service.GetTask", 0, &t) + err := c.conn.Call("Service.GetTask", passID, &t) return t, err } @@ -208,12 +222,6 @@ func (c *Client) taskFailed(meta TaskMeta) error { // NextRecord will block until the next record is available. It is // thread-safe. func (c *Client) NextRecord() ([]byte, error) { - c.initChOnce.Do(func() { - // initialize with in case WithBuffer is not used. - c.ch = make(chan record, 0) - go c.getRecords() - }) - r := <-c.ch return r.r, r.err } diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index ee305e2c80..d5f3d79464 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -54,22 +54,22 @@ func TestGetFinishTask(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) - if err != nil { - panic(err) + s, sErr := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) + if sErr != nil { + panic(sErr) } server := rpc.NewServer() - err = server.Register(s) - if err != nil { - panic(err) + sErr = server.Register(s) + if sErr != nil { + panic(sErr) } mux := http.NewServeMux() mux.Handle(rpc.DefaultRPCPath, server) - err = http.Serve(l, mux) - if err != nil { - panic(err) + sErr = http.Serve(l, mux) + if sErr != nil { + panic(sErr) } }(l) @@ -103,6 +103,7 @@ func TestGetFinishTask(t *testing.T) { ch := make(chan string, 1) ch <- addr go c.monitorMaster(ch) + err = c.SetDataset([]string{path}) if err != nil { panic(err) @@ -111,44 +112,47 @@ func TestGetFinishTask(t *testing.T) { checkOnePass := func(i int) { var tasks []Task for idx := 0; idx < totalTask; idx++ { - task, err := c.getTask() - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + task, cErr := c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("error: %v, pass: %d\n", cErr, i) } tasks = append(tasks, task) } - _, err = c.getTask() - if err == nil { + // getting task before task finishes should return error + _, cErr := c.getTask(i) + if cErr == nil { t.Fatalf("Should get error, pass: %d\n", i) } - err = c.taskFinished(tasks[0].Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(tasks[0].Meta.ID) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } - - err = c.taskFailed(tasks[0].Meta) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + // call taskFailed once won't put the task to failed queue, just ensure + // the call + cErr = c.taskFailed(tasks[0].Meta) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } tasks = tasks[1:] - task, err := c.getTask() - if err != nil { - t.Fatal(err) + _, cErr = c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("Should be ErrNoMoreAvailable or ErrPassAfter: %s", cErr) } - tasks = append(tasks, task) for _, task := range tasks { - err = c.taskFinished(task.Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(task.Meta.ID) + if cErr != nil { + t.Fatal(cErr) } } } for i := 0; i < 10; i++ { + // init pass data + c.StartGetRecords(i) checkOnePass(i) } } diff --git a/go/master/client_test.go b/go/master/client_test.go index a3a434ae7e..79b9cc844d 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -20,8 +20,10 @@ import ( "net/http" "net/rpc" "os" + "runtime" "strconv" "strings" + "sync" "testing" "time" @@ -29,6 +31,18 @@ import ( "github.com/PaddlePaddle/recordio" ) +// tool function for testing output goroutine ids +func goid() int { + var buf [64]byte + n := runtime.Stack(buf[:], false) + idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0] + id, err := strconv.Atoi(idField) + if err != nil { + panic(fmt.Sprintf("cannot get goroutine id: %v", err)) + } + return id +} + func TestNextRecord(t *testing.T) { const ( path = "/tmp/master_client_TestFull" @@ -45,7 +59,7 @@ func TestNextRecord(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1) + s, err := master.NewService(&master.InMemStore{}, 1, time.Second*60, 1) if err != nil { panic(err) } @@ -69,7 +83,7 @@ func TestNextRecord(t *testing.T) { panic(err) } - w := recordio.NewWriter(f, -1, -1) + w := recordio.NewWriter(f, 1, -1) for i := 0; i < total; i++ { _, err = w.Write([]byte{byte(i)}) if err != nil { @@ -87,32 +101,49 @@ func TestNextRecord(t *testing.T) { panic(err) } - c, err := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(10)) - if err != nil { - panic(err) - } - - err = c.SetDataset([]string{path}) - if err != nil { - panic(err) - } - - for pass := 0; pass < 50; pass++ { - received := make(map[byte]bool) - for i := 0; i < total; i++ { - r, err := c.NextRecord() - if err != nil { - t.Fatal(pass, i, "Read error:", err) + // start several client to test task fetching + var wg sync.WaitGroup + for i := 0; i < 4; i++ { + wg.Add(1) + // test for multiple concurrent clients + go func() { + defer wg.Done() + // each go-routine needs a single client connection instance + c, e := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(1)) + if e != nil { + t.Fatal(e) } - - if len(r) != 1 { - t.Fatal(pass, i, "Length should be 1.", r) + e = c.SetDataset([]string{path}) + if e != nil { + panic(e) } - - if received[r[0]] { - t.Fatal(pass, i, "Received duplicate.", received, r) + // test for n passes + for pass := 0; pass < 10; pass++ { + c.StartGetRecords(pass) + + received := make(map[byte]bool) + taskid := 0 + for { + r, e := c.NextRecord() + if e != nil { + // ErrorPassAfter will wait, else break for next pass + if e.Error() == master.ErrPassBefore.Error() || + e.Error() == master.ErrNoMoreAvailable.Error() { + break + } + t.Fatal(pass, taskid, "Read error:", e) + } + if len(r) != 1 { + t.Fatal(pass, taskid, "Length should be 1.", r) + } + if received[r[0]] { + t.Fatal(pass, taskid, "Received duplicate.", received, r) + } + taskid++ + received[r[0]] = true + } } - received[r[0]] = true - } + }() } + wg.Wait() } diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index ae6b6f776b..94848d887e 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -39,15 +39,12 @@ type EtcdClient struct { statePath string client *clientv3.Client lock *concurrency.Mutex + sess *concurrency.Session } // NewEtcdClient creates a new EtcdClient. func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) { log.Debugf("Connecting to etcd at %v", endpoints) - // TODO(helin): gracefully shutdown etcd store. Because etcd - // store holds a etcd lock, even though the lock will expire - // when the lease timeout, we need to implement graceful - // shutdown to release the lock. cli, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, DialTimeout: dialTimeout, @@ -67,12 +64,12 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat // one master running, but split-brain problem may cause // multiple master servers running), and the cluster management // software will kill one of them. - log.Debugf("Trying to acquire lock at %s.", lockPath) + log.Infof("Trying to acquire lock at %s.", lockPath) err = lock.Lock(context.TODO()) if err != nil { return nil, err } - log.Debugf("Successfully acquired lock at %s.", lockPath) + log.Infof("Successfully acquired lock at %s.", lockPath) put := clientv3.OpPut(addrPath, addr) resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit() @@ -89,6 +86,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat statePath: statePath, client: cli, lock: lock, + sess: sess, } return e, nil @@ -157,6 +155,21 @@ func (e *EtcdClient) Load() ([]byte, error) { return state, nil } +// Shutdown shuts down the etcd client gracefully. +func (e *EtcdClient) Shutdown() error { + err := e.sess.Close() + newErr := e.client.Close() + if newErr != nil { + if err == nil { + err = newErr + } else { + log.Errorln(newErr) + } + } + + return err +} + // GetKey gets the value by the specify key. func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go index ffd663f7f0..a5bd2d4fe1 100644 --- a/go/master/inmem_store.go +++ b/go/master/inmem_store.go @@ -40,3 +40,8 @@ func (m *InMemStore) Load() ([]byte, error) { return m.buf, nil } + +// Shutdown shuts down the in mem store. +func (m *InMemStore) Shutdown() error { + return nil +} diff --git a/go/master/service.go b/go/master/service.go index d1ec8939e1..d30e9a3322 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -19,6 +19,7 @@ import ( "compress/gzip" "encoding/gob" "errors" + "math/rand" "os" "path/filepath" "sync" @@ -33,10 +34,23 @@ const ( dialTimeout = 5 * time.Second ) +// ErrAllTaskFailed occur when tasks are in done or failed state. +var ErrAllTaskFailed = errors.New("all task finished") + +// ErrNoMoreAvailable occur when no task in todo and yet not all done or fail. +var ErrNoMoreAvailable = errors.New("no more available task") + +// ErrPassBefore client side pass number does not match with master counter. +var ErrPassBefore = errors.New("pass number smaller than master") + +// ErrPassAfter client side pass number does not match with master counter. +var ErrPassAfter = errors.New("pass number larger than master") + // Store is the interface for save and load the master state. type Store interface { Save([]byte) error Load() ([]byte, error) + Shutdown() error } // Chunk is a chunk of data consisted of several data instances. @@ -75,17 +89,26 @@ type Service struct { chunksPerTask int timeoutDur time.Duration failureMax int - ready chan struct{} store Store - mu sync.Mutex - initDone bool - taskQueues taskQueues + ready chan struct{} + initDone bool + + mu sync.Mutex + taskQueues taskQueues + currPass int + jobTasks []taskEntry + savingTrainer string } func partition(chunks []Chunk, chunksPerTask int) []taskEntry { - id := 0 + // generate uniq id across job using nanosecond + randint + counter + // FIXME(typhoonzero): this is a workaround, use uuid + randStart := rand.Int() + counter := 0 + timestamp := time.Now().Nanosecond() + id := timestamp + randStart + counter if chunksPerTask <= 0 { chunksPerTask = 1 } @@ -95,7 +118,8 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry { for i, c := range chunks { if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 { cur.Task.Meta.ID = id - id++ + counter++ + id = timestamp + randStart + counter result = append(result, cur) cur.Task.Chunks = nil } @@ -266,19 +290,21 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error { return err } - s.taskQueues.Todo = partition(chunks, s.chunksPerTask) + s.jobTasks = partition(chunks, s.chunksPerTask) + s.taskQueues.Todo = s.jobTasks err = s.snapshot() if err != nil { log.Errorln(err) return err } - close(s.ready) s.initDone = true return nil } +// processFailedTask retry s.failureMax times for failed task. +// return true if all task are done or failed. func (s *Service) processFailedTask(t taskEntry, epoch int) { if t.Task.Meta.Epoch != epoch { // new epoch, task launched after the @@ -302,8 +328,9 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) { return } - log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure) + log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure) s.taskQueues.Todo = append(s.taskQueues.Todo, t) + return } func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { @@ -331,37 +358,30 @@ func (s *Service) logFields() log.Fields { } // GetTask gets a new task from the service. -func (s *Service) GetTask(_ int, task *Task) error { +// passID is the client side pass count +func (s *Service) GetTask(passID int, task *Task) error { select { case <-s.ready: } s.mu.Lock() defer s.mu.Unlock() + if passID < s.currPass { + return ErrPassBefore + } + if passID > s.currPass { + // Client may get run to pass after master when one client faster than the + // other + return ErrPassAfter + } if len(s.taskQueues.Todo) == 0 { - if len(s.taskQueues.Done) == 0 { - if len(s.taskQueues.Pending) == 0 { - err := errors.New("all task failed") - log.WithFields(s.logFields()).Warningln("All tasks failed.") - return err - } - - // TODO(helin): client need to retry in this - // error case. Gotcha: RPC client can't - // compare returned error with predefined - // errors like io.EOF, because the error - // instance deserialized from RPC is a - // different instance than the error defined - // in package. So we need to figure out a way - // for client to check this error correctly. - err := errors.New("no more available task") - log.WithFields(s.logFields()).Warningln("No more available task.") - return err + if len(s.taskQueues.Done) == 0 && len(s.taskQueues.Pending) == 0 { + log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass") + return ErrAllTaskFailed } - s.taskQueues.Todo = s.taskQueues.Done - s.taskQueues.Done = nil - log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.") + log.WithFields(s.logFields()).Warningln("No more available task.") + return ErrNoMoreAvailable } t := s.taskQueues.Todo[0] @@ -381,7 +401,7 @@ func (s *Service) GetTask(_ int, task *Task) error { } // TaskFinished tell the service that a task is finished. -func (s *Service) TaskFinished(taskID int, _ *int) error { +func (s *Service) TaskFinished(taskID int, dummy *int) error { select { case <-s.ready: } @@ -401,11 +421,14 @@ func (s *Service) TaskFinished(taskID int, _ *int) error { delete(s.taskQueues.Pending, taskID) log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID) - - if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 { - log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.") - s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...) - s.taskQueues.Done = nil + if len(s.taskQueues.Todo) == 0 && len(s.taskQueues.Pending) == 0 { + // increase master side pass count if all tasks finished + s.currPass++ + s.taskQueues.Todo = s.jobTasks + s.taskQueues.Done = []taskEntry{} + // TODO(typhoonzero): deal with failed tasks + s.taskQueues.Failed = []taskEntry{} + log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.currPass) } err := s.snapshot() @@ -416,7 +439,7 @@ func (s *Service) TaskFinished(taskID int, _ *int) error { } // TaskFailed tells the service that a task is failed. -func (s *Service) TaskFailed(meta TaskMeta, _ *int) error { +func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { select { case <-s.ready: } diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go index 69a882fc33..bd1a939a55 100644 --- a/go/master/service_internal_test.go +++ b/go/master/service_internal_test.go @@ -44,7 +44,8 @@ func TestPartionIndex(t *testing.T) { cs := make([]Chunk, 100) ts := partition(cs, 20) for i := range ts { - if ts[i].Task.Meta.ID != i { + // test auto increament ids + if i > 0 && ts[i].Task.Meta.ID != ts[i-1].Task.Meta.ID+1 { t.Error(ts[i], i) } } diff --git a/go/master/service_test.go b/go/master/service_test.go new file mode 100644 index 0000000000..5f91910ecc --- /dev/null +++ b/go/master/service_test.go @@ -0,0 +1,68 @@ +package master_test + +import ( + "os" + "testing" + "time" + + "github.com/PaddlePaddle/Paddle/go/master" + "github.com/coreos/etcd/clientv3" + "github.com/coreos/etcd/embed" + "github.com/docker/docker/pkg/ioutils" + "github.com/stretchr/testify/assert" +) + +func TestNewServiceWithEtcd(t *testing.T) { + // setup an embed etcd server + etcdDir, err := ioutils.TempDir("", "") + if err != nil { + t.Fatal(err) + } + cfg := embed.NewConfig() + cfg.Dir = etcdDir + e, err := embed.StartEtcd(cfg) + if err != nil { + t.Fatal(err) + } + defer func() { + e.Close() + if err := os.RemoveAll(etcdDir); err != nil { + t.Fatal(err) + } + }() + select { + case <-e.Server.ReadyNotify(): + t.Log("Server is ready!") + case <-time.After(60 * time.Second): + e.Server.Stop() // trigger a shutdown + t.Fatal("Server took too long to start!") + } + + ep := []string{"127.0.0.1:2379"} + masterAddr := "127.0.0.1:3306" + store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30) + if err != nil { + t.Fatal(err) + } + + _, err = master.NewService(store, 10, 10, 3) + if err != nil { + t.Fatal(err) + } + cli, err := clientv3.New(clientv3.Config{ + Endpoints: ep, + DialTimeout: 3 * time.Second, + }) + if err != nil { + t.Fatal(err) + } + v, err := master.GetKey(cli, master.DefaultAddrPath, 3*time.Second) + if err != nil { + t.Fatal(err) + } + if err := cli.Close(); err != nil { + t.Fatal(err) + } + // test master process registry itself into etcd server. + assert.Equal(t, masterAddr, v, "master process should registry itself into etcd server.") +} diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go index 0f7e20cdd8..14ad077455 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -55,10 +55,10 @@ var curHandle C.paddle_pserver_client func add(c *client.Client) C.paddle_pserver_client { mu.Lock() defer mu.Unlock() - client := curHandle + cli := curHandle curHandle++ - handleMap[client] = c - return client + handleMap[cli] = c + return cli } func get(client C.paddle_pserver_client) *client.Client { diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index e9264592b4..85cb399590 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -6,16 +6,19 @@ import cPickle as pickle etcd_ip = os.getenv("MASTER_IP", "127.0.0.1") etcd_endpoint = "http://" + etcd_ip + ":2379" +print "connecting to master, etcd endpoints: ", etcd_endpoint +master_client = master.client(etcd_endpoint, 5, 64) def cloud_reader(): - print "connecting to master, etcd endpoints: ", etcd_endpoint - master_client = master.client(etcd_endpoint, 5, 64) + global master_client master_client.set_dataset( - ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"]) + ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*"], passes=30) while 1: r, e = master_client.next_record() if not r: + if e != -2: # other errors + print "get record error:", e break yield pickle.loads(r) @@ -27,10 +30,12 @@ def main(): # network config x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) y_predict = paddle.layer.fc(input=x, - param_attr=paddle.attr.Param(name='w'), + param_attr=paddle.attr.Param( + name='w', learning_rate=1e-3), size=1, act=paddle.activation.Linear(), - bias_attr=paddle.attr.Param(name='b')) + bias_attr=paddle.attr.Param( + name='b', learning_rate=1e-3)) y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) cost = paddle.layer.mse_cost(input=y_predict, label=y) @@ -38,9 +43,8 @@ def main(): parameters = paddle.parameters.create(cost) # create optimizer of new remote updater to pserver - optimizer = paddle.optimizer.Momentum(momentum=0) + optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3) - print "etcd endoint: ", etcd_endpoint trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, update_equation=optimizer, @@ -51,6 +55,8 @@ def main(): # event_handler to print training and testing info def event_handler(event): if isinstance(event, paddle.event.EndIteration): + # FIXME: for cloud data reader, pass number is managed by master + # should print the server side pass number if event.batch_id % 100 == 0: print "Pass %d, Batch %d, Cost %f" % ( event.pass_id, event.batch_id, event.cost) diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 98ff8ce827..4fb2630766 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -34,16 +34,19 @@ const ( PsPath = "/ps/" // PsCheckpoint is the etcd path for store checkpoints information PsCheckpoint = "/checkpoints/" + + retryTimeout = 5 * time.Second ) // EtcdClient is the etcd client that the pserver uses for fault // tolerance, service registry and coordination. type EtcdClient struct { - numPservers int - etcdEndpoints string - etcdClient *clientv3.Client - // etcdTimeout is also used as retry intervals. - etcdTimeout time.Duration + numPservers int + endpoints string + client *clientv3.Client + sess *concurrency.Session + dialTimeout time.Duration + ttlSec int // FIXME: ensure GetExternalIP gets the correct ip for trainers to connect. externalIP string // desired number of pservers in the job. @@ -52,11 +55,12 @@ type EtcdClient struct { } // NewEtcdClient creates an EtcdClient -func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient { +func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient { return &EtcdClient{ - etcdTimeout: timeout, - numPservers: numPservers, - etcdEndpoints: endpoints, + dialTimeout: dialtimeout, + ttlSec: ttlSec, + numPservers: numPservers, + endpoints: endpoints, } } @@ -64,7 +68,6 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et // // Register returns the index of the current pserver. func (e *EtcdClient) Register(port int) (int, error) { - var err error e.externalIP, err = networkhelper.GetExternalIP() if err != nil { @@ -72,19 +75,26 @@ func (e *EtcdClient) Register(port int) (int, error) { } // initialize connection to etcd. - ep := strings.Split(e.etcdEndpoints, ",") + ep := strings.Split(e.endpoints, ",") for { cli, err := clientv3.New(clientv3.Config{ Endpoints: ep, - DialTimeout: e.etcdTimeout, + DialTimeout: e.dialTimeout, }) if err != nil { log.Errorf("connect to etcd error: %v", err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) + continue + } + e.client = cli + sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec)) + if err != nil { + log.Errorf("create etcd session error: %v", err) + time.Sleep(retryTimeout) continue } - e.etcdClient = cli - log.Debugf("inited client to %s", e.etcdEndpoints) + e.sess = sess + log.Debugf("inited client to %s", e.endpoints) break } // init /ps_desired using transaction, for multiple pservers may want to write @@ -95,7 +105,7 @@ func (e *EtcdClient) Register(port int) (int, error) { cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -106,18 +116,18 @@ func (e *EtcdClient) Register(port int) (int, error) { // wait and set s.desired init value for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) - resp, err := e.etcdClient.Get(ctx, PsDesired) + resp, err := e.client.Get(ctx, PsDesired) cancel() if err != nil { log.Errorf("getting %s error: %v", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } if len(resp.Kvs) != 0 { e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { log.Errorf("value of %s invalid %v\n", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) // NOTE: wait util ps_desired value change continue } @@ -134,7 +144,7 @@ func (e *EtcdClient) Register(port int) (int, error) { cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -144,10 +154,10 @@ func (e *EtcdClient) Register(port int) (int, error) { } func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) { - return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + return concurrency.NewSTM(e.client, func(c concurrency.STM) error { dsStr := c.Get(PsDesired) if dsStr == "" { - c.Put(PsDesired, strconv.Itoa(numPservers)) + c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease())) } return nil }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) @@ -156,7 +166,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) ( // registerPserverEtcd registers pserver node on etcd using transaction. func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) { var idx int - _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + _, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error { registered := false for i := 0; i < e.desired; i++ { psKey := PsPath + strconv.Itoa(i) @@ -165,26 +175,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er log.Debugf("got value (%s) for key: %s", ps, psKey) if ps == "" { - resp, err := e.etcdClient.Grant(context.TODO(), 5) - if err != nil { - log.Fatal(err) - } // find the first id and write info pserverAddr := e.externalIP + ":" + strconv.Itoa(port) - c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID)) + c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease())) log.Debugf("set pserver node %s with value %s", psKey, pserverAddr) - ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID) - if kaerr != nil { - log.Errorf("keepalive etcd node error: %v", kaerr) - return kaerr - } - - // Eat the keep alive message so etcd - // will not expire the lease. - go func(ch <-chan *clientv3.LeaseKeepAliveResponse) { - ka := <-ch - log.Debugf("keepalive: %d\n", ka.TTL) - }(ch) log.Debug("register finished") idx = i registered = true @@ -207,7 +201,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er // GetKey gets the value by the specified key func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) - resp, err := e.etcdClient.Get(ctx, key) + resp, err := e.client.Get(ctx, key) cancel() if err != nil { return []byte{}, err @@ -223,7 +217,27 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { // PutKey put into etcd with value by key specified func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error { ctx, cancel := context.WithTimeout(context.Background(), timeout) - _, err := e.etcdClient.Put(ctx, key, string(value)) + _, err := e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease())) cancel() return err } + +// Shutdown shuts down the etcd client gracefully. +func (e *EtcdClient) Shutdown() error { + var err error + if e.sess != nil { + err = e.sess.Close() + } + + if e.client != nil { + newErr := e.client.Close() + if newErr != nil { + if err != nil { + log.Errorln(newErr) + } else { + err = newErr + } + } + } + return err +} diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp index 681e3a3809..fcda6eaf03 100644 --- a/paddle/api/Evaluator.cpp +++ b/paddle/api/Evaluator.cpp @@ -37,7 +37,7 @@ std::vector Evaluator::getNames() const { double Evaluator::getValue(const std::string name) const { paddle::Error err; double v = m->rawPtr->getValue(name, &err); - if (err) { + if (!err.isOK()) { throw std::runtime_error(err.msg()); } return v; diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 433edbfda7..21cb7c7265 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -3,7 +3,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory) +cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) @@ -29,7 +29,5 @@ py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -proto_library(net_proto SRCS net_proto.proto DEPS op_proto) -# cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op) -cc_library(net SRCS net.cc DEPS operator net_proto op_registry) +cc_library(net SRCS net.cc DEPS op_registry) cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op) diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/detail/tensor-inl.h new file mode 100644 index 0000000000..e7ff09dd5c --- /dev/null +++ b/paddle/framework/detail/tensor-inl.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/memory/memcpy.h" + +namespace paddle { +namespace framework { + +template +inline void Tensor::check_memory_size() const { + PADDLE_ENFORCE(holder_ != nullptr, + "Tenosr holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory."); +} + +template +inline const T* Tensor::data() const { + check_memory_size(); + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); +} + +template +inline T* Tensor::data() { + check_memory_size(); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline T* Tensor::mutable_data(DDim dims, platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + Resize(dims); + return mutable_data(place); +} + +template +inline T* Tensor::mutable_data(platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + PADDLE_ENFORCE(product(dims_) > 0, + "Tensor's numel must be larger than zero to call " + "Tensor::mutable_data. Call Tensor::set_dim first."); + /* some versions of boost::variant don't have operator!= */ + size_t size = product(dims_) * sizeof(T); + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } +#ifndef PADDLE_ONLY_CPU + else if (platform::is_gpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline void Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; +} + +template +inline void Tensor::CopyFrom(const Tensor& src, + const platform::Place& dst_place) { + src.check_memory_size(); + Resize(src.dims()); + + auto src_place = src.holder_->place(); + auto src_ptr = static_cast(src.data()); + + auto dst_ptr = static_cast(mutable_data(dst_place)); + + auto size = product(src.dims_) * sizeof(T); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifndef PADDLE_ONLY_CPU + else if (platform::is_gpu_place(src_place) && + platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } + +#endif +} + +template +inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { + check_memory_size(); + PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); + PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); + PADDLE_ENFORCE(begin_idx < end_idx, + "Begin index must be less than end index."); + PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); + int base = product(dims_) / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * sizeof(T); + return dst; +} + +inline void Tensor::Resize(const DDim& dims) { dims_ = dims; } + +inline const DDim& Tensor::dims() const { return dims_; } + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index bc23b63b35..2cd378c6b2 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -20,17 +20,7 @@ namespace paddle { namespace framework { -std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps) { - auto grad_ops = std::make_shared(); - for (auto& op : ForwardOps->ops_) { - auto op_grad = OpRegistry::CreateGradOp(op); - grad_ops->AddOp(op_grad); - } - grad_ops->CompleteAddOp(); - return grad_ops; -} - -void PlainNet::CompleteAddOp(bool calc) { +void NetOp::CompleteAddOp(bool calc) { add_op_done_ = true; if (!calc) return; std::unordered_set input_set; @@ -70,7 +60,7 @@ void PlainNet::CompleteAddOp(bool calc) { attrs_["temporary_index"] = tmp_index; } -std::string PlainNet::DebugString() const { +std::string NetOp::DebugString() const { std::ostringstream os; os << OperatorBase::DebugString() << std::endl; for (auto& op : ops_) { @@ -82,5 +72,7 @@ std::string PlainNet::DebugString() const { return os.str(); } +bool NetOp::IsNetOp() const { return true; } + } // namespace framework } // namespace paddle diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 3264f1f565..089c135595 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -37,21 +37,7 @@ namespace framework { * This is the base class of network, all the networks should implement the APIs * it defines. */ -class Net : public OperatorBase { - public: - virtual void AddOp(const std::shared_ptr& op) = 0; - virtual void CompleteAddOp(bool calc) = 0; -}; - -using NetPtr = std::shared_ptr; - -/** - * @brief a basic implementation of Net. - * - * PlainNet is a very simple Net, it create a list of operators, and run them - * sequentially following the order they added. - */ -class PlainNet : public Net { +class NetOp : public OperatorBase { public: /** * Infer all the operators' input and output variables' shapes, will be called @@ -80,15 +66,17 @@ class PlainNet : public Net { /** * @brief Add an operator by ptr */ - void AddOp(const std::shared_ptr& op) override { + void AddOp(const std::shared_ptr& op) { PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed"); ops_.push_back(op); } - void CompleteAddOp(bool calculate = true) override; + void CompleteAddOp(bool calculate = true); std::string DebugString() const override; + bool IsNetOp() const override; + std::vector> ops_; private: @@ -100,7 +88,5 @@ class PlainNet : public Net { } }; -std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps); - } // namespace framework } // namespace paddle diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc index 20b42cbb49..8048311fe5 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/framework/net_op_test.cc @@ -40,7 +40,7 @@ void AssertSameVectorWithoutOrder(const std::vector& expected, } TEST(OpKernel, all) { - auto net = std::make_shared(); + auto net = std::make_shared(); ASSERT_NE(net, nullptr); auto op1 = std::make_shared(); @@ -69,30 +69,23 @@ TEST(OpKernel, all) { net->Run(scope, dev_ctx); ASSERT_EQ(2, infer_shape_cnt); ASSERT_EQ(2, run_cnt); - ASSERT_THROW(net->AddOp(op2), std::runtime_error); -} -TEST(AddBackwardOp, TestGradOp) { - auto net = std::make_shared(); - ASSERT_NE(net, nullptr); - net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {})); - net->AddOp( - framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {})); - net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, {})); - auto grad_ops = AddBackwardOp(net); - for (auto& op : grad_ops->ops_) { - op->DebugString(); - } + ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet); } -// TODO(zhihong): add fc grad without registering. -// TEST(AddBackwardOp, TestNoGradOp) { -// auto net = std::make_shared(); -// ASSERT_NE(net, nullptr); -// net->AddOp(framework::OpRegistry::CreateOp("fc", {"X", "W", "b"}, {"Y"}, -// {})); auto grad_ops = AddBackwardOp(net); for (auto& op : grad_ops->ops_) { -// op->DebugString(); -// } -// } +//! TODO(yuyang18): Refine Backward Op. +// TEST(AddBackwardOp, TestGradOp) { +// auto net = std::make_shared(); +// ASSERT_NE(net, nullptr); +// net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {})); +// net->AddOp( +// framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {})); +// net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, +// {})); +// auto grad_ops = AddBackwardOp(net); +// for (auto& op : grad_ops->ops_) { +// op->DebugString(); +// } +//} } // namespace framework } // namespace paddle diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto deleted file mode 100644 index 0779f49fe2..0000000000 --- a/paddle/framework/net_proto.proto +++ /dev/null @@ -1,15 +0,0 @@ -syntax="proto2"; -package paddle.framework; - -import "op_proto.proto"; - -message NetDesc { - // network identification - optional string name = 1; - // operator contains in network - repeated OpProto operators = 2; - // network type to run with. e.g "plainNet", "DAG" - optional string net_type = 3; - // num worker always - optional int32 num_workers = 4; -} diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 05095372d8..2ef781bf86 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -90,7 +90,7 @@ TEST(OpRegistry, IllegalAttr) { bool caught = false; try { paddle::framework::OpRegistry::CreateOp(op_desc); - } catch (std::runtime_error& err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "larger_than check fail"; const char* err_msg = err.what(); @@ -136,7 +136,7 @@ TEST(OpRegistry, CustomChecker) { bool caught = false; try { paddle::framework::OpRegistry::CreateOp(op_desc); - } catch (std::runtime_error& err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "Attribute 'test_attr' is required!"; const char* err_msg = err.what(); @@ -154,7 +154,7 @@ TEST(OpRegistry, CustomChecker) { caught = false; try { paddle::framework::OpRegistry::CreateOp(op_desc); - } catch (std::runtime_error& err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "'test_attr' must be even!"; const char* err_msg = err.what(); @@ -192,7 +192,7 @@ TEST(ProtoMaker, DuplicatedAttr) { pd::OpProto op_proto; pd::OpAttrChecker op_checker; auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); - ASSERT_THROW(proto_maker.Validate(), std::runtime_error); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); } class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker { @@ -208,5 +208,5 @@ TEST(ProtoMaker, DuplicatedInOut) { pd::OpProto op_proto; pd::OpAttrChecker op_checker; auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); - ASSERT_THROW(proto_maker.Validate(), std::runtime_error); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 1e57e9a20f..3a1ffc0215 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -34,22 +34,26 @@ KernelContext::GetEigenDevice() const { #endif const std::string& OperatorBase::Input(const std::string& name) const { + PADDLE_ENFORCE(in_out_idxs_ != nullptr, + "Input Output Indices could not be nullptr"); auto it = in_out_idxs_->find(name); PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_", name); - if (attrs_.count("input_format") == 0) { - return inputs_[it->second]; + return inputs_.at((size_t)it->second); } else { const auto& input_format = GetAttr>("input_format"); int idx = input_format[it->second]; - return inputs_.at(idx); + return inputs_.at((size_t)idx); } } std::vector OperatorBase::Inputs(const std::string& name) const { + PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr"); auto input_format = GetAttr>("input_format"); auto offset = in_out_idxs_->at(name); + PADDLE_ENFORCE(input_format.at((size_t)offset + 1) <= inputs_.size(), + "Input Out Of Range"); return std::vector{ inputs_.begin() + input_format.at(offset), @@ -57,23 +61,25 @@ std::vector OperatorBase::Inputs(const std::string& name) const { } const std::string& OperatorBase::Output(const std::string& name) const { + PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr"); auto it = in_out_idxs_->find(name); PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_", name); - if (attrs_.count("output_format") == 0) { - return outputs_[it->second]; + return outputs_.at((size_t)it->second); } else { const auto& output_format = GetAttr>("output_format"); int idx = output_format[it->second]; - return outputs_.at(idx); + return outputs_.at((size_t)idx); } } std::vector OperatorBase::Outputs(const std::string& name) const { + PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr"); auto output_format = GetAttr>("output_format"); auto offset = in_out_idxs_->at(name); - + PADDLE_ENFORCE(output_format.at((size_t)offset + 1) <= outputs_.size(), + "Output Out of Range"); return std::vector{ outputs_.begin() + output_format.at(offset), outputs_.begin() + output_format.at(offset + 1)}; diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 97e9ec1bcf..0a8c82ee47 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -90,15 +90,17 @@ class OperatorBase { virtual void Run(const std::shared_ptr& scope, const platform::DeviceContext& dev_ctx) const = 0; - // Get a input with argument's name described in `op_proto` + virtual bool IsNetOp() const { return false; } + + //! Get a input with argument's name described in `op_proto` const std::string& Input(const std::string& name) const; - // Get a input which has multiple variables. - // TODO add a vector_view to prevent memory copy. + //! Get a input which has multiple variables. + //! TODO add a vector_view to prevent memory copy. std::vector Inputs(const std::string& name) const; - // Get a output with argument's name described in `op_proto` + //! Get a output with argument's name described in `op_proto` const std::string& Output(const std::string& name) const; - // Get an output which has multiple variables. - // TODO add a vector_view to prevent memory copy. + //! Get an output which has multiple variables. + //! TODO add a vector_view to prevent memory copy. std::vector Outputs(const std::string& name) const; public: @@ -199,8 +201,6 @@ class OperatorWithKernel : public OperatorBase { place_ = dev_ctx.GetPlace(); } - // bool operator==(const OpKernelKey& o) const { return place_ == o.place_; - // } bool operator==(const OpKernelKey& o) const { return platform::places_are_same_class(place_, o.place_); } diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index 79c9ffd1a6..4faaf84144 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -56,7 +56,9 @@ class Scope { if (var) { return var; } else { - vars_[name] = std::unique_ptr(new Variable()); + auto ptr = new Variable(); + name_to_var_[name] = std::unique_ptr(ptr); + var_to_name_[ptr] = name; return GetVariable(name); } } @@ -68,8 +70,8 @@ class Scope { * from it's parent scope. Return nullptr if not found. */ Variable* GetVariable(const std::string& name) const { - auto it = vars_.find(name); - if (it != vars_.end()) { + auto it = name_to_var_.find(name); + if (it != name_to_var_.end()) { return it->second.get(); } else if (parent_ != nullptr) { return parent_->GetVariable(name); @@ -84,12 +86,21 @@ class Scope { * Find if there is a Variable in this scope and it's parent scope */ bool HasVariable(const std::string& name) const { - return (vars_.find(name) != vars_.end() || + return (name_to_var_.find(name) != name_to_var_.end() || (parent_ && parent_->HasVariable(name))); } + std::string GetVariableName(Variable* const var) const { + try { + return var_to_name_.at(var); + } catch (...) { + return ""; + } + } + private: - std::unordered_map> vars_; + std::unordered_map var_to_name_; + std::unordered_map> name_to_var_; std::shared_ptr parent_{nullptr}; }; diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc index df1afb200c..ff069c7be0 100644 --- a/paddle/framework/scope_test.cc +++ b/paddle/framework/scope_test.cc @@ -40,6 +40,11 @@ TEST(Scope, Create) { /// already exist. Variable* var4 = scope->CreateVariable("a"); EXPECT_EQ(var4, var2); + + EXPECT_EQ("a", scope->GetVariableName(var4)); + Scope scope2; + auto var = scope2.CreateVariable("tmp"); + EXPECT_EQ("", scope->GetVariableName(var)); } TEST(Scope, Parent) { diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc index 964f15ab66..ea7b2a1f7b 100644 --- a/paddle/framework/tensor.cc +++ b/paddle/framework/tensor.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "paddle/framework/tensor.h" namespace paddle { namespace framework {} diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 5f07256c05..d9ceedb453 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/framework/ddim.h" #include "paddle/memory/memcpy.h" #include "paddle/memory/memory.h" +#include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -32,9 +33,11 @@ template struct CastToPyBufferImpl; } // namespace details } // namespace pybind + namespace framework { class Tensor { + public: template friend struct paddle::pybind::details::CastToPyBufferImpl; @@ -47,151 +50,123 @@ class Tensor { public: Tensor() : offset_(0) {} + /*! Return a pointer to mutable memory block. */ template - const T* data() const { - EnforceSufficientMemory(); - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - } + inline T* data(); + /*! Return a pointer to constant memory block. */ template - T* data() { - EnforceSufficientMemory(); - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } - - template ::value>::type* = nullptr> - T* mutable_data(DDim dims, platform::Place place) { - Resize(dims); - return mutable_data(place); - } - - template ::value>::type* = nullptr> - T* mutable_data(platform::Place place) { - PADDLE_ENFORCE(product(dims_) > 0, - "Tensor's numel must be larger than zero to call " - "Tensor::mutable_data. Call Tensor::set_dim first."); - if (holder_ == nullptr || - !(holder_->place() == - place) /* some versions of boost::variant don't have operator!= */ - || holder_->size() < product(dims_) * sizeof(T) + offset_) { - if (platform::is_cpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), product(dims_) * sizeof(T))); - } else if (platform::is_gpu_place(place)) { -#ifdef PADDLE_ONLY_CPU - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); -#else - holder_.reset(new PlaceholderImpl( - boost::get(place), product(dims_) * sizeof(T))); -#endif - } else { - PADDLE_THROW("Unknown 'place'."); - } - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } + inline const T* data() const; + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ template - void ShareDataWith(const Tensor& src) { - src.EnforceSufficientMemory(); - *this = src; - } + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + /*! Resize the dimensions of the memory block. */ + inline void Resize(const DDim& dims); + /*! The internal of two tensors share the same memory block. */ template - void CopyFrom(const Tensor& src, platform::Place dst_place) { - PADDLE_ENFORCE(platform::is_cpu_place(dst_place), - "Tensor::CopyFrom only support dst CPU now."); - size_t size = product(src.dims_) * sizeof(T); - Resize(src.dims()); - const void* src_ptr = static_cast(src.data()); - void* dst_ptr = static_cast(mutable_data(dst_place)); - if (paddle::platform::is_cpu_place(holder_->place())) { - std::memcpy(dst_ptr, src_ptr, size); - } else if (paddle::platform::is_gpu_place(holder_->place())) { -#ifdef PADDLE_ONLY_CPU - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); -#else - platform::GpuMemcpySync(dst_ptr, src_ptr, size, cudaMemcpyDeviceToHost); -#endif - } - } + inline void ShareDataWith(const Tensor& src); + + /** + * @brief Copy the content of external tensor to a new place. + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains place where to store. + * + * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. + */ + template + inline void CopyFrom(const Tensor& src, const platform::Place& dst_place); + + /** + * @brief Return the slice of the tensor. + * + * @param[in] begin_idx The begin index of the slice. + * @param[in] end_idx The end index of the slice. + */ + inline Tensor Slice(const int& begin_idx, const int& end_idx) const; + private: template - Tensor Slice(const int& begin_idx, const int& end_idx) const { - EnforceSufficientMemory(); - PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); - PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); - PADDLE_ENFORCE(begin_idx < end_idx, - "Begin index must be less than end index."); - PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); - int base = product(dims_) / dims_[0]; - Tensor dst; - dst.holder_ = holder_; - DDim dst_dims = dims_; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * sizeof(T); - return dst; - } - - void Resize(const DDim& dims) { dims_ = dims; } - - const DDim& dims() const { return dims_; } + inline void check_memory_size() const; paddle::platform::Place place() const { return holder_->place(); } private: - // Placeholder hides type T, so it doesn't appear as a template - // parameter of Variable. + /** + * @note Placeholder hides type T, so it doesn't appear as a template + * parameter of Variable. + */ struct Placeholder { virtual ~Placeholder() {} virtual void* ptr() const = 0; - virtual platform::Place place() const = 0; virtual size_t size() const = 0; virtual std::type_index type() const = 0; + virtual platform::Place place() const = 0; }; - template + template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(PlaceType place, size_t size) + PlaceholderImpl(Place place, size_t size) : ptr_(static_cast(memory::Alloc(place, size)), - memory::PODDeleter(place)), + memory::PODDeleter(place)), place_(place), - size_(size) {} + size_(size) { + PADDLE_ENFORCE(ptr_ != nullptr, "Insufficient %s memory to allocation.", + is_cpu_place(place_) ? "CPU" : "GPU"); + } - virtual void* ptr() const { return static_cast(ptr_.get()); } virtual size_t size() const { return size_; } - virtual paddle::platform::Place place() const { return place_; } + virtual platform::Place place() const { return place_; } + virtual void* ptr() const { return static_cast(ptr_.get()); } virtual std::type_index type() const { return std::type_index(typeid(T)); } - std::unique_ptr> ptr_; - platform::Place place_; // record the place of ptr_. - size_t size_; // size of the memory block. + /*! the pointer of memory block. */ + std::unique_ptr> ptr_; + + /*! the place of memory block. */ + platform::Place place_; + + /*! the size of memory block. */ + size_t size_; }; - template - inline void EnforceSufficientMemory() const { - PADDLE_ENFORCE(holder_ != nullptr, - "Tenosr holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, - "Tensor's dims_ is out of bound. Call Tensor::mutable_data " - "first to re-allocate memory."); - } - - std::shared_ptr holder_; // holds the memory block if allocated. + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. */ DDim dims_; - // A PlaceHolder may be shared by more than one tensor. Some of them may be - // slices of the others. So the offset_ is introduced here to indicate the - // byte offset between PlaceHolder::ptr_ and where tensor's data really - // begins. + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really begins. + */ size_t offset_; }; } // namespace framework } // namespace paddle + +#include "paddle/framework/detail/tensor-inl.h" diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 089844dc01..ef1cc10b84 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -33,7 +33,7 @@ TEST(Tensor, DataAssert) { bool caught = false; try { src_tensor.data(); - } catch (std::runtime_error& err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data first."; @@ -72,7 +72,8 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); EXPECT_EQ(p1, p2); } -#ifdef __CUDACC__ + +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; float* p1 = nullptr; @@ -107,7 +108,7 @@ TEST(Tensor, ShareDataWith) { bool caught = false; try { dst_tensor.ShareDataWith(src_tensor); - } catch (std::runtime_error& err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data first."; @@ -123,7 +124,7 @@ TEST(Tensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } -#ifdef __CUDACC__ +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; Tensor dst_tensor; @@ -160,7 +161,7 @@ TEST(Tensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } -#ifdef __CUDACC__ +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); @@ -188,25 +189,74 @@ TEST(Tensor, Slice) { TEST(Tensor, CopyFrom) { using namespace paddle::framework; using namespace paddle::platform; + { + Tensor src_tensor; + Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); - Tensor src_tensor; - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - Tensor dst_tensor; - dst_tensor.CopyFrom(src_tensor, CPUPlace()); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); + auto cpu_place = new paddle::platform::CPUPlace(); + dst_tensor.CopyFrom(src_tensor, *cpu_place); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, *cpu_place); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } } +#ifndef PADDLE_ONLY_CPU + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_place = new paddle::platform::GPUPlace(0); + gpu_tensor.CopyFrom(src_tensor, *gpu_place); + + // GPU Tensor to CPU Tensor + auto cpu_place = new paddle::platform::CPUPlace(); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + + // Compare Tensors + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + gpu_tensor.CopyFrom(slice_tensor, *gpu_place); - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, CPUPlace()); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + // GPU Tensor to CPU Tensor + dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + + // Compare Slice Tensors + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } } +#endif } diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index a40530f413..81cc3c890b 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -207,8 +207,8 @@ Error __must_check backward(Argument& act) { argument_.value->setData(act.value->getData() + offset, 1UL, size); argument_.grad->setData(act.grad->getData() + offset, 1UL, size); - Error status = softmax_.backward(argument_); - if (!status) return status; + Error err = softmax_.backward(argument_); + if (!err.isOK()) return err; } return Error(); } diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 27c1b4033b..bb44970109 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -27,12 +27,11 @@ BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, system_allocator_(std::move(system_allocator)) {} BuddyAllocator::~BuddyAllocator() { - DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these " - "have actually been freed"; + VLOG(3) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_ - << ")"; + VLOG(3) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -52,12 +51,11 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // acquire the allocator lock std::lock_guard lock(mutex_); - DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " - << size; + VLOG(3) << "Allocate " << unaligned_size << " bytes from chunk size " << size; // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { - DLOG(INFO) << "Allocate from system allocator."; + VLOG(3) << "Allocate from system allocator."; return SystemAlloc(size); } @@ -72,9 +70,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return nullptr; } } else { - DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it) - << " at address " - << reinterpret_cast(std::get<2>(*it))->data(); + VLOG(3) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); } total_used_ += size; @@ -91,10 +89,10 @@ void BuddyAllocator::Free(void* p) { // Acquire the allocator lock std::lock_guard lock(mutex_); - DLOG(INFO) << "Free from address " << block; + VLOG(3) << "Free from address " << block; if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { - DLOG(INFO) << "Free directly from system allocator"; + VLOG(3) << "Free directly from system allocator"; system_allocator_->Free(block, block->total_size(cache_), block->index(cache_)); @@ -111,8 +109,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the right buddy if (block->has_right_buddy(cache_)) { - DLOG(INFO) << "Merging this block " << block << " with its right buddy " - << block->right_buddy(cache_); + VLOG(3) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); auto right_buddy = block->right_buddy(cache_); @@ -129,8 +127,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the left buddy if (block->has_left_buddy(cache_)) { - DLOG(INFO) << "Merging this block " << block << " with its left buddy " - << block->left_buddy(cache_); + VLOG(3) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); auto left_buddy = block->left_buddy(cache_); @@ -146,8 +144,8 @@ void BuddyAllocator::Free(void* p) { } // Dumping this block into pool - DLOG(INFO) << "Inserting free block (" << block << ", " - << block->total_size(cache_) << ")"; + VLOG(3) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); @@ -166,7 +164,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(index, size); - DLOG(INFO) << "Allocated " << p << " from system allocator."; + VLOG(3) << "Allocated " << p << " from system allocator."; if (p == nullptr) return nullptr; @@ -192,8 +190,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { if (p == nullptr) return pool_.end(); - DLOG(INFO) << "Creating and inserting new block " << p - << " from system allocator"; + VLOG(3) << "Creating and inserting new block " << p + << " from system allocator"; static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, max_chunk_size_, nullptr, nullptr); @@ -237,19 +235,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, auto block = static_cast(std::get<2>(*it)); pool_.erase(it); - DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_) - << ") into"; + VLOG(3) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; block->split(cache_, size); - DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_) - << ")"; + VLOG(3) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; block->set_type(cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist if (block->has_right_buddy(cache_)) { if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { - DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", " - << block->right_buddy(cache_)->total_size(cache_) << ")"; + VLOG(3) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->right_buddy(cache_)->index(cache_), @@ -276,7 +274,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() { return; } - DLOG(INFO) << "Return block " << block << " to fallback allocator."; + VLOG(3) << "Return block " << block << " to fallback allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -312,7 +310,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() { MemoryBlock* block = static_cast(std::get<2>(*pool)); - DLOG(INFO) << "Return block " << block << " to base allocator."; + VLOG(3) << "Return block " << block << " to base allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index 098931c887..aaab1142ca 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -35,7 +35,7 @@ void Copy(platform::CPUPlace dst_place, platform::GPUPlace src_place, const void* src, size_t num, cudaStream_t stream) { - platform::GPUPlaceGuard g(src_place.device); + platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } @@ -45,7 +45,7 @@ void Copy(platform::GPUPlace dst_place, platform::CPUPlace src_place, const void* src, size_t num, cudaStream_t stream) { - platform::GPUPlaceGuard g(dst_place.device); + platform::SetDeviceId(dst_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } @@ -56,7 +56,7 @@ void Copy(platform::GPUPlace dst_place, const void* src, size_t num, cudaStream_t stream) { if (dst_place == src_place) { - platform::GPUPlaceGuard g(src_place.device); + platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); } else { platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h index 99b1c2e1c3..2b9c0eada6 100644 --- a/paddle/memory/memcpy.h +++ b/paddle/memory/memcpy.h @@ -20,13 +20,39 @@ limitations under the License. */ namespace paddle { namespace memory { +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * + */ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); #ifndef PADDLE_ONLY_CPU + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. + * + */ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, cudaStream_t stream); + #endif // PADDLE_ONLY_CPU } // namespace memory diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index c2e046926f..207025f9b1 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -60,6 +60,7 @@ detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { platform::GpuMaxChunkSize()); } } + platform::SetDeviceId(gpu_id); return as[gpu_id]; } diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 5e0d647072..44f567caf9 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -20,19 +20,53 @@ limitations under the License. */ namespace paddle { namespace memory { +/** + * \brief Allocate memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] size Allocation size. + * + * \return Allocated memory block address. + * + * \note If return nullptr, it indicates memory allocation failed + * because insufficient memory in current system. When Alloc + * function is invoked, you must check the returned memory + * address is valid or not. + */ template -void* Alloc(Place, size_t); +void* Alloc(Place place, size_t size); +/** + * \brief Free memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] ptr Memory block address to free. + * + */ template -void Free(Place, void*); +void Free(Place place, void* ptr); +/** + * \brief Total size of used memory in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * + */ template -size_t Used(Place); +size_t Used(Place place); -template ::value>::type* = nullptr> +/** + * \brief Free memory block in one place. + * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template class PODDeleter { + static_assert(std::is_pod::value, "T must be POD"); + public: PODDeleter(Place place) : place_(place) {} void operator()(T* ptr) { Free(place_, static_cast(ptr)); } diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 0a14dc2114..5085e1b925 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -54,3 +54,8 @@ op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op softmax_op net) op_library(sgd_op SRCS sgd_op.cc sgd_op.cu) + +op_library(recurrent_network_op SRCS recurrent_network_op.cc DEPS op_desc +tensor op_registry operator net) +cc_test(recurrent_network_op_test SRCS recurrent_network_op_test.cc DEPS +recurrent_network_op gtest mul_op add_op) diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 8d415fbd2e..1424b02843 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -13,17 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/add_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class AddOp : public framework::OperatorWithKernel { +class AddOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two"); PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one"); PADDLE_ENFORCE( @@ -35,10 +32,10 @@ protected: } }; -class AddOpMaker : public framework::OpProtoAndCheckerMaker { +class AddOpMaker : public OpProtoAndCheckerMaker { public: - AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of add op"); AddInput("Y", "The second input of add op"); AddOutput("Out", "The output of add op"); @@ -50,11 +47,10 @@ The equation is: Out = X + Y } }; -class AddOpGrad : public framework::OperatorWithKernel { +class AddOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "AddOpGrad"; return ""; @@ -64,7 +60,6 @@ protected: } // namespace operators } // namespace paddle -REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); -REGISTER_GRADIENT_OP(add_two, add_two_grad, paddle::operators::AddOpGrad); -REGISTER_OP_CPU_KERNEL( - add_two, paddle::operators::AddKernel); +REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker); +REGISTER_GRADIENT_OP(add_two, add_two_grad, ops::AddOpGrad); +REGISTER_OP_CPU_KERNEL(add_two, ops::AddKernel); diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index 2e5a755f92..79d8de6cd4 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -1,5 +1,4 @@ -#include "paddle/operators/add_op.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/add_op.h" -REGISTER_OP_GPU_KERNEL(add_two, - paddle::operators::AddKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(add_two, ops::AddKernel); diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h index 39d54a63bd..0c39433788 100644 --- a/paddle/operators/add_op.h +++ b/paddle/operators/add_op.h @@ -13,27 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class AddKernel : public framework::OpKernel { +class AddKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto input0 = context.Input(0)->Get(); - auto input1 = context.Input(1)->Get(); - auto* output = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto input0 = context.Input(0)->Get(); + auto input1 = context.Input(1)->Get(); + auto output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - framework::EigenVector::Flatten(*output).device( + EigenVector::Flatten(*output).device( *(context.GetEigenDevice())) = - framework::EigenVector::Flatten(input0) + - framework::EigenVector::Flatten(input1); + EigenVector::Flatten(input0) + EigenVector::Flatten(input1); } }; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 7d7bb09f3d..46c88d4d1a 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -13,17 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/cross_entropy_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class OnehotCrossEntropyOp : public framework::OperatorWithKernel { +class OnehotCrossEntropyOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "Input size of OnehotCrossEntropyOp must be two"); PADDLE_ENFORCE(outputs.size() == 1, @@ -35,15 +32,14 @@ protected: PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2."); PADDLE_ENFORCE(outputs[0]->dims().size() == 1, "label's dimension must be 1."); - outputs[0]->Resize(framework::make_ddim({inputs[0]->dims()[0]})); + outputs[0]->Resize({inputs[0]->dims()[0]}); } }; -class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { +class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker { public: - OnehotCrossEntropyOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of OnehotCrossEntropyOp"); AddInput("label", "The second input of OnehotCrossEntropyOp"); AddOutput("Y", "The output of OnehotCrossEntropyOp"); @@ -59,9 +55,7 @@ OnehotCrossEntropy Operator. } // namespace paddle REGISTER_OP(onehot_cross_entropy, - paddle::operators::OnehotCrossEntropyOp, - paddle::operators::OnehotCrossEntropyOpMaker); -REGISTER_OP_CPU_KERNEL( - onehot_cross_entropy, - paddle::operators::OnehotCrossEntropyOpKernel<::paddle::platform::CPUPlace, - float>); + ops::OnehotCrossEntropyOp, + ops::OnehotCrossEntropyOpMaker); +REGISTER_OP_CPU_KERNEL(onehot_cross_entropy, + ops::OnehotCrossEntropyOpKernel); diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 1bcdcb7ea6..19e4b74596 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -1,6 +1,4 @@ #include "paddle/operators/cross_entropy_op.h" -#include "paddle/framework/op_registry.h" REGISTER_OP_GPU_KERNEL(onehot_cross_entropy, - paddle::operators::OnehotCrossEntropyOpKernel< - ::paddle::platform::GPUPlace, float>); \ No newline at end of file + ops::OnehotCrossEntropyOpKernel); \ No newline at end of file diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index ad2c7f34e1..0383df46be 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -13,23 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "glog/logging.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class OnehotCrossEntropyOpKernel : public framework::OpKernel { +class OnehotCrossEntropyOpKernel : public OpKernel { public: constexpr T LOG_THRESHOLD() const { return static_cast(1e-20); } - void Compute(const framework::KernelContext& context) const override { - auto X = context.Input(0)->Get(); + void Compute(const KernelContext& context) const override { + auto X = context.Input(0)->Get(); const T* X_data = X.data(); - const int* label_data = - context.Input(1)->Get().data(); - auto* Y = context.Output(0)->GetMutable(); + const int* label_data = context.Input(1)->Get().data(); + auto* Y = context.Output(0)->GetMutable(); Y->mutable_data(context.GetPlace()); diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index 01e96f4c48..c4a9f5937f 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -12,41 +12,38 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/framework/net.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/operator.h" +#include "type_alias.h" namespace paddle { namespace operators { -class FullyConnectedOp : public framework::PlainNet { +class FullyConnectedOp : public NetOp { public: void Init() override { - AddOp(framework::OpRegistry::CreateOp("mul", - { - Input("X"), Input("W"), - }, - {Output("before_act")}, - {})); + AddOp(OpRegistry::CreateOp("mul", + { + Input("X"), Input("W"), + }, + {Output("before_act")}, + {})); auto b = Input("b"); - if (b != framework::OperatorBase::EMPTY_VAR_NAME()) { - AddOp(framework::OpRegistry::CreateOp("rowwise_add", - {Output("before_act"), Input("b")}, - {Output("before_act")}, - {})); + if (b != EMPTY_VAR_NAME()) { + AddOp(OpRegistry::CreateOp("rowwise_add", + {Output("before_act"), Input("b")}, + {Output("before_act")}, + {})); } auto activation = GetAttr("activation"); - AddOp(framework::OpRegistry::CreateOp( + AddOp(OpRegistry::CreateOp( activation, {Output("before_act")}, {Output("Y")}, {})); CompleteAddOp(false); } }; -class FullyConnectedOpMaker : public framework::OpProtoAndCheckerMaker { +class FullyConnectedOpMaker : public OpProtoAndCheckerMaker { public: - FullyConnectedOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "the input of fc operator"); AddInput("W", "the weight of fc operator"); @@ -71,6 +68,4 @@ USE_OP(rowwise_add); USE_OP(sigmoid); USE_OP(softmax); -REGISTER_OP(fc, - paddle::operators::FullyConnectedOp, - paddle::operators::FullyConnectedOpMaker); +REGISTER_OP(fc, ops::FullyConnectedOp, ops::FullyConnectedOpMaker); diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index cd74c8b976..22c1b78005 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -13,17 +13,14 @@ limitations under the License. */ #include "paddle/operators/mul_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class MulOp : public framework::OperatorWithKernel { +class MulOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs"); auto dim0 = inputs[0]->dims(); auto dim1 = inputs[1]->dims(); @@ -37,10 +34,10 @@ protected: } }; -class MulOpMaker : public framework::OpProtoAndCheckerMaker { +class MulOpMaker : public OpProtoAndCheckerMaker { public: - MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of mul op"); AddInput("Y", "The second input of mul op"); AddOutput("Out", "The output of mul op"); @@ -52,11 +49,10 @@ The equation is: Out = X * Y } }; -class MulOpGrad : public framework::OperatorWithKernel { +class MulOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "MulGrad"; return ""; @@ -66,8 +62,7 @@ protected: } // namespace operators } // namespace paddle -REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker); -REGISTER_GRADIENT_OP(mul, mul_grad, paddle::operators::MulOpGrad); +REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker); +REGISTER_GRADIENT_OP(mul, mul_grad, ops::MulOpGrad); -REGISTER_OP_CPU_KERNEL( - mul, paddle::operators::MulKernel); +REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu index 3ee581dc77..c27fc886ce 100644 --- a/paddle/operators/mul_op.cu +++ b/paddle/operators/mul_op.cu @@ -13,8 +13,5 @@ limitations under the License. */ #include "paddle/operators/mul_op.h" -#include "paddle/framework/op_registry.h" -REGISTER_OP_GPU_KERNEL(mul, - paddle::operators::MulKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); \ No newline at end of file diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index e6bad7fb9d..4679750446 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -14,30 +14,27 @@ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class MulKernel : public framework::OpKernel { +class MulKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { + void Compute(const KernelContext& context) const override { Eigen::array, 1> dim_pair = { {Eigen::IndexPair(1, 0)}}; - auto input0 = context.Input(0)->Get(); - auto input1 = context.Input(1)->Get(); - auto* output = context.Output(0)->GetMutable(); + auto input0 = context.Input(0)->Get(); + auto input1 = context.Input(1)->Get(); + auto* output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - framework::EigenMatrix::From(*output).device( - *(context.GetEigenDevice())) = - framework::EigenMatrix::From(input0).contract( - framework::EigenMatrix::From(input1), dim_pair); + EigenMatrix::From(*output).device(*(context.GetEigenDevice())) = + EigenMatrix::From(input0).contract(EigenMatrix::From(input1), + dim_pair); } }; } // namespace operators diff --git a/paddle/operators/recurrent_network_op.cc b/paddle/operators/recurrent_network_op.cc new file mode 100644 index 0000000000..1a101d6ddf --- /dev/null +++ b/paddle/operators/recurrent_network_op.cc @@ -0,0 +1,419 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/recurrent_network_op.h" + +#include +#include +#include + +#include "paddle/framework/net.h" +#include "paddle/framework/op_registry.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace operators { + +namespace rnn { + +void SegmentInputs(std::vector>& step_scopes, + const std::vector& inlinks, + const size_t seq_len) { + PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); + for (size_t i = 0; i < inlinks.size(); ++i) { + Tensor* input = + step_scopes[0]->GetVariable(inlinks[i].external)->GetMutable(); + DDim dims = input->dims(); + PADDLE_ENFORCE(static_cast(dims[0]) == seq_len, + "all the inlinks must have same length"); + DDim step_dims = slice_ddim(dims, 1, dims.size()); + for (size_t j = 0; j < seq_len; j++) { + Tensor* step_input = step_scopes[j] + ->CreateVariable(inlinks[i].internal) + ->GetMutable(); + *step_input = input->Slice(j, j + 1); + step_input->Resize(step_dims); + } + } +} + +void ConcatOutputs(std::vector>& step_scopes, + const std::vector& outlinks, + const size_t seq_len) { + for (size_t i = 0; i < outlinks.size(); i++) { + Tensor* output = + step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable(); + + // TODO(qingiqng) remove following code after adding + // InferShape in RecurrentGradientOp + DDim step_dims = step_scopes[0] + ->GetVariable(outlinks[i].internal) + ->GetMutable() + ->dims(); + std::vector dims_vec = vectorize(step_dims); + dims_vec.insert(dims_vec.begin(), seq_len); + output->mutable_data(make_ddim(dims_vec), platform::CPUPlace()); + + for (size_t j = 0; j < seq_len; j++) { + Tensor* step_output = step_scopes[j] + ->GetVariable(outlinks[i].internal) + ->GetMutable(); + // TODO(luotao02) data type and platform::DeviceContext() should set + // correctly + (output->Slice(j, j + 1)) + .CopyFrom(*step_output, platform::CPUPlace()); + } + } +} + +void LinkMemories(std::vector>& scopes, + const std::vector& memories, + size_t step_id, + int offset) { + PADDLE_ENFORCE(step_id < scopes.size(), + "step [%d] is out of range of step scopes' size [%d]", + step_id, + scopes.size()); + PADDLE_ENFORCE(static_cast(step_id) + offset >= 0, + "offset [%d] must be large than -[%d]", + offset, + step_id); + PADDLE_ENFORCE(step_id + offset < scopes.size(), + "offset [%d] is out of range, it must be less than (%d - %d)", + offset, + scopes.size(), + step_id); + std::shared_ptr scope = scopes[step_id]; + std::shared_ptr linked_scope = scopes[step_id + offset]; + for (auto& attr : memories) { + auto mem = scope->CreateVariable(attr.pre_var)->GetMutable(); + // maybe share variable is better? + auto linked_mem = linked_scope->GetVariable(attr.var)->GetMutable(); + mem->ShareDataWith(*linked_mem); + + // TODO(qingqing) remove following code + // the memory of current step should be allocated in step net + auto m = scope->CreateVariable(attr.var)->GetMutable(); + // for unit test, as addOp and mulOp are null currently, if not + // mutable_data, mem.data() in output will be error. We will + // remove this line after merge the correct addOp and mulOp. + m->mutable_data(mem->dims(), platform::CPUPlace()); + } +} + +void InitArgument(const ArgumentName& name, + Argument* arg, + const OperatorBase& op) { + arg->step_net = op.Input(name.step_net); + arg->step_scopes = op.Output(name.step_scopes); + + auto inlinks = op.Inputs(name.inlinks); + auto inlink_alias = op.GetAttr>(name.inlink_alias); + PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(), + "the size of inlinks and inlink_alias don't match:%d,%d", + inlinks.size(), + inlink_alias.size()); + for (size_t i = 0; i < inlinks.size(); ++i) { + rnn::Link link; + link.external = inlinks[i]; + link.internal = inlink_alias[i]; + (arg->inlinks).push_back(link); + } + + auto outlinks = op.Outputs(name.outlinks); + auto outlink_alias = op.GetAttr>(name.outlink_alias); + PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(), + "the size of outlinks and outlink_alias don't match:%d,%d", + outlinks.size(), + outlink_alias.size()); + for (size_t i = 0; i < outlinks.size(); ++i) { + rnn::Link link; + link.external = outlinks[i]; + link.internal = outlink_alias[i]; + (arg->outlinks).push_back(link); + } + + auto boot_memories = op.Inputs(name.boot_memories); + + // attributes + auto memories = op.GetAttr>(name.memories); + auto pre_memories = op.GetAttr>(name.pre_memories); + + PADDLE_ENFORCE(memories.size() == boot_memories.size(), + "the size of memories, boot_memories don't match:%d,%d", + memories.size(), + boot_memories.size()); + PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), + "the size of pre_memories, boot_memories don't match:%d,%d", + pre_memories.size(), + boot_memories.size()); + PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set"); + + for (size_t i = 0; i < memories.size(); ++i) { + rnn::MemoryAttr mem_attr; + mem_attr.var = memories[i]; + mem_attr.pre_var = pre_memories[i]; + mem_attr.boot_var = boot_memories[i]; + (arg->memories).push_back(mem_attr); + } +} + +} // namespace rnn + +void RecurrentAlgorithm::InferShape(const std::shared_ptr& scope) const { + seq_len_ = scope->GetVariable((arg_->inlinks[0]).external) + ->GetMutable() + ->dims()[0]; + CreateScopes(scope); + auto step_scopes = GetStepScopes(scope); + + // SegmentInputs is called in InferShape. The input must hold memory in + // SegmentInputs. But the other op only set dimension for the output in + // InferShape. That's a problem. Wether the RNN op needs InferShape or not? + // Wether the following functions (SegmentInputs, InitMemories, ...) need + // to rewrite for RNN op? + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); + + InitMemories(step_scopes[0]); + + PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), + "stepnet [%s] is not in scope.", + arg_->step_net); + Variable* net = scope->GetVariable(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + // If the InferShape is called in OperatorBase's run function, + // the rnn op only needs to do InferShape for the first time step + for (size_t i = 0; i < seq_len_; i++) { + if (i > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, i, -1); + } + net->GetMutable()->InferShape(step_scopes[i]); + } + + auto outlinks = arg_->outlinks; + for (size_t i = 0; i < outlinks.size(); i++) { + DDim step_dims = step_scopes[0] + ->GetVariable(outlinks[i].internal) + ->GetMutable() + ->dims(); + std::vector dims_vec = vectorize(step_dims); + // now only support fixed length + dims_vec.insert(dims_vec.begin(), seq_len_); + Tensor* output = + step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable(); + output->Resize(make_ddim(dims_vec)); + } +} + +void RecurrentAlgorithm::Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + + Variable* net = scope->GetVariable(arg_->step_net); + for (size_t step_id = 0; step_id < seq_len_; step_id++) { + // the link memory is done in InferShape + // maybe remove following code after testing + if (step_id > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1); + } + net->GetMutable()->Run(step_scopes[step_id], dev_ctx); + } + + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); +} + +void RecurrentAlgorithm::CreateScopes(std::shared_ptr scope) const { + // TODO(xxx) Only two scopes are needed for inference, this case will be + // supported later. + auto step_scopes = scope->GetVariable(arg_->step_scopes) + ->GetMutable>>(); + + if (seq_len_ > step_scopes->size()) { + for (size_t i = step_scopes->size(); i < seq_len_; ++i) { + std::shared_ptr step_scope = std::make_shared(scope); + + // Now all variables in scope must be created outside of op. + auto net_op = scope->GetVariable(arg_->step_net)->GetMutable(); + for (auto& input : net_op->inputs_) { + step_scope->CreateVariable(input); + } + for (auto& output : net_op->outputs_) { + step_scope->CreateVariable(output); + } + + step_scopes->push_back(std::make_shared(step_scope)); + } + } +} + +void RecurrentAlgorithm::InitMemories(std::shared_ptr step_scope) const { + for (auto& attr : arg_->memories) { + Tensor* pre_mem = + step_scope->CreateVariable(attr.pre_var)->GetMutable(); + PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var), + "memory [%s]'s boot variable [%s] not exists", + attr.var, + attr.boot_var); + Tensor* boot_mem = + step_scope->GetVariable(attr.boot_var)->GetMutable(); + pre_mem->ShareDataWith(*boot_mem); + + // TODO(qingqing) remove following code + // the memory of current step should be allocated in step net + // here for unit test + auto cur_step_mem = + step_scope->CreateVariable(attr.var)->GetMutable(); + cur_step_mem->mutable_data(boot_mem->dims(), platform::CPUPlace()); + } +} + +const rnn::ArgumentName RecurrentOp::kArgName{"step_net", + "step_scopes", + "inlinks", + "outlinks", + "inlink_alias", + "outlink_alias", + "memories", + "pre_memories", + "boot_memories"}; + +const rnn::ArgumentName RecurrentGradientOp::kArgName{"step_net", + "step_scopes", + "outlink@grad", + "inlink@grad", + "inlink_alias", + "outlink_alias", + "memories", + "pre_memories", + "boot_memories@grad"}; + +void RecurrentOp::Init() { + OperatorBase::Init(); + std::unique_ptr arg(new rnn::Argument()); + rnn::InitArgument(kArgName, arg.get(), *this); + alg_.Init(std::move(arg)); +} + +class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker { +public: + RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + const auto& name = RecurrentOp::kArgName; + // inputs and outputs stored in proto + AddInputs(name.inlinks, + "the input that need to be segmented for each step."); + AddInputs(name.boot_memories, "variables to initialize memories."); + AddInput(name.step_net, "network shared by all steps."); + + AddOutputs(name.outlinks, + "the output that need to concated for all steps."); + AddOutput(name.step_scopes, "step scopes"); + + // Attributes stored in AttributeMap + AddAttr>(name.inlink_alias, "alias of inlinks"); + AddAttr>(name.outlink_alias, "alias of outlinks"); + AddAttr>(name.pre_memories, + "names of pre-memories"); + AddAttr>(name.memories, "names of memories"); + + AddComment("This is a recurrent group operator."); + } +}; + +void RecurrentGradientAlgorithm::Run( + const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); + PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), + "step net is not in scope."); + Variable* net = scope->GetVariable(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); + } + net->GetMutable()->Run(step_scopes[step_id], dev_ctx); + } + LinkBootMemoryGradients(step_scopes[0]); + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); +} + +void RecurrentGradientAlgorithm::LinkBootMemoryGradients( + std::shared_ptr step_scope) const { + for (auto& attr : arg_->memories) { + Tensor* mem_grad = + step_scope->CreateVariable(attr.var)->GetMutable(); + PADDLE_ENFORCE(mem_grad != nullptr, + "boot_tensor should be retrieved before"); + PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var), + "memory [%s]'s boot variable [%s] not exists", + attr.var, + attr.boot_var); + Tensor* boot_mem_grad = + step_scope->CreateVariable(attr.boot_var)->GetMutable(); + boot_mem_grad->ShareDataWith(*mem_grad); + } +} + +void RecurrentGradientAlgorithm::InferShape( + const std::shared_ptr& scope) const { + seq_len_ = scope->GetVariable((arg_->inlinks[0]).external) + ->GetMutable() + ->dims()[0]; + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); + + PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), + "step net is not in scope."); + Variable* net = scope->GetVariable(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); + } + net->GetMutable()->InferShape(step_scopes[step_id]); + } + + auto outlinks = arg_->outlinks; + for (size_t i = 0; i < outlinks.size(); i++) { + DDim step_dims = step_scopes[0] + ->GetVariable(outlinks[i].internal) + ->GetMutable() + ->dims(); + std::vector dims_vec = vectorize(step_dims); + // now only support fixed length + dims_vec.insert(dims_vec.begin(), seq_len_); + Tensor* output = + step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable(); + output->Resize(make_ddim(dims_vec)); + } + LinkBootMemoryGradients(step_scopes[0]); +} + +void RecurrentGradientOp::Init() { + OperatorBase::Init(); + std::unique_ptr arg(new rnn::Argument()); + rnn::InitArgument(kArgName, arg.get(), *this); + alg_.Init(std::move(arg)); +} + +} // namespace operators +} // namespace paddle + +REGISTER_OP(recurrent_op, + paddle::operators::RecurrentOp, + paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker); diff --git a/paddle/operators/recurrent_network_op.h b/paddle/operators/recurrent_network_op.h new file mode 100644 index 0000000000..8946c8ce38 --- /dev/null +++ b/paddle/operators/recurrent_network_op.h @@ -0,0 +1,216 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +using namespace paddle::framework; + +namespace rnn { + +/** + * Memory of a RNN (same as the role of `Momory` in PaddlePaddle). + * + * Memory attributes cached by this op, dims will be infered from + * boot memories in father scope. Other attributes are copied from Op's proto + * attributes. + */ +struct MemoryAttr { + // name of current state variable + std::string var; + // name of previous step's state variable + std::string pre_var; + // name of the variables to init this memory (same role of `boot_layer` in + // PaddlePaddle), which is store in father's scope. + std::string boot_var; +}; + +struct Link { + // input or output links name. + std::string internal; + // alias to avoid duplicate keys in scopes. + std::string external; +}; + +struct Argument { + std::string step_net; + std::string step_scopes; + std::vector inlinks; + std::vector outlinks; + std::vector memories; +}; + +struct ArgumentName { + std::string step_net; + std::string step_scopes; + std::string inlinks; + std::string outlinks; + std::string inlink_alias; // the alias of inlinks in step net. + std::string outlink_alias; // the alias of outlinks in step net. + std::string memories; // the memory name + std::string pre_memories; // the previous memory name + std::string boot_memories; // the boot memory name +}; + +/** + * Prepare inputs for each step net. + */ +void SegmentInputs(std::vector>& step_scopes, + const std::vector& inlinks, + const size_t seq_len); + +/** + * Process outputs of step nets and merge to variables. + */ +void ConcatOutputs(std::vector>& step_scopes, + const std::vector& outlinks, + const size_t seq_len); + +void LinkMemories(std::vector>& step_scopes, + const std::vector& memories, + size_t step_id, + int offset); + +void InitArgument(const ArgumentName& name, Argument* arg); + +}; // namespace rnn + +// The sequence format in RecurrentOp is Tensor now. +// TODO: +// 1. No-padding computing for sequences with indifinite length in one batch. +// 2. Hierarchical RNN for sequence with sub-sequence. +// 3. Internal Memory. +// 4. More Complex RNN architecture, such as Gated Feedback RNN. +// Refer to: https://arxiv.org/pdf/1502.02367.pdf + +class RecurrentAlgorithm { +public: + void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const; + + void Init(std::unique_ptr arg) { arg_ = std::move(arg); } + + /** + * InferShape must be called before Run. + */ + void InferShape(const std::shared_ptr& scope) const; + +protected: + /* + * The step scopes will be stored in the father scope as a variable. + * + * NOTE the scopes are reused in both the forward and backward, so just + * create once and expand its size if more steps need. + */ + void CreateScopes(std::shared_ptr scope) const; + + inline const std::vector>& GetStepScopes( + std::shared_ptr scope) const { + return *(scope->GetVariable(arg_->step_scopes)) + ->GetMutable>>(); + } + + void InitMemories(std::shared_ptr step_scopes) const; + +private: + std::unique_ptr arg_; + mutable size_t seq_len_; +}; + +class RecurrentGradientAlgorithm { + /** + * RNN's backward alogorithm. + * + * To accelerate the development of RecurrentGradientOp, we decouple RNN's + * algorithm and `OperatorBase`'s implementation, the former contains the core + * implementation of a RNN, and will keep stable even if the framework changes + * a + * lot, and the latter is a wrapper acts like an dapter for it to make RNN an + * operator. + */ +public: + void Init(std::unique_ptr arg) { arg_ = std::move(arg); } + + void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const; + + void LinkBootMemoryGradients(std::shared_ptr step_scopes) const; + + /** + * InferShape must be called before Run. + */ + void InferShape(const std::shared_ptr& scope) const; + +protected: + inline const std::vector>& GetStepScopes( + std::shared_ptr scope) const { + return *(scope->GetVariable(arg_->step_scopes)) + ->GetMutable>>(); + } + +private: + std::unique_ptr arg_; + mutable size_t seq_len_; +}; + +class RecurrentOp final : public OperatorBase { +public: + void Init() override; + + /** + * InferShape must be called before Run. + */ + virtual void InferShape(const std::shared_ptr& scope) const override { + alg_.InferShape(scope); + } + + virtual void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + +private: + RecurrentAlgorithm alg_; +}; + +class RecurrentGradientOp final : public OperatorBase { +public: + void Init() override; + + /** + * InferShape must be called before Run. + */ + virtual void InferShape(const std::shared_ptr& scope) const override { + alg_.InferShape(scope); + } + + virtual void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + +private: + RecurrentGradientAlgorithm alg_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/recurrent_network_op_test.cc b/paddle/operators/recurrent_network_op_test.cc new file mode 100644 index 0000000000..6784ac6001 --- /dev/null +++ b/paddle/operators/recurrent_network_op_test.cc @@ -0,0 +1,400 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + +#include "paddle/framework/net.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/tensor.h" +#include "paddle/operators/recurrent_network_op.h" + +namespace paddle { +namespace operators { + +class RecurrentOpTest : public ::testing::Test { +protected: + virtual void SetUp() override { + CreateGlobalVariables(); + CreateStepNet(); + CreateRNNOp(); + } + + virtual void TearDown() override {} + + void CreateGlobalVariables() { + scope_ = std::make_shared(); + // create input, and init content + LOG(INFO) << "create global variable x"; + for (auto inlink : std::vector{"x", "x0", "x1", "h"}) { + Variable* x = scope_->CreateVariable(inlink); + DDim dims = make_ddim(std::vector{ + 10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/}); + x->GetMutable()->mutable_data(dims, platform::CPUPlace()); + } + // create output alias just for test + for (auto inlink : std::vector{"h@alias"}) { + Variable* x = scope_->CreateVariable(inlink); + DDim dims = + make_ddim(std::vector{20 /*batch size*/, 30 /*input dim*/}); + x->GetMutable()->mutable_data(dims, platform::CPUPlace()); + } + + LOG(INFO) << "create global variable w"; + Variable* w = scope_->CreateVariable("rnn/w"); + w->GetMutable()->mutable_data( + make_ddim(std::vector{30, 30}), platform::CPUPlace()); + + for (auto boot : std::vector{"x_boot", "h_boot"}) { + LOG(INFO) << "create global variable " << boot; + Variable* h_boot = scope_->CreateVariable(boot); + h_boot->GetMutable()->mutable_data( + make_ddim(std::vector{20 /*batch size*/, 30 /*input dim*/}), + platform::CPUPlace()); + } + + LOG(INFO) << "create variable step_scopes"; + scope_->CreateVariable("step_scopes"); + + LOG(INFO) << "create variable h"; + scope_->CreateVariable("h"); + } + + void CreateRNNOp() { + OpDesc op_desc; + + op_desc.set_type("recurrent_op"); + // inlinks 0 + op_desc.add_inputs("x"); + op_desc.add_inputs("x0"); + op_desc.add_inputs("x1"); + // boot_memories 3 + op_desc.add_inputs("x_boot"); + op_desc.add_inputs("h_boot"); + // step net 5 + op_desc.add_inputs("step_net"); + // outlinks 6 + op_desc.add_outputs("h"); + // step scopes 7 + op_desc.add_outputs("step_scopes"); + + auto _input_format = std::vector{ + 0, // in_link + 3, // memories + 5 // step_net + }; + auto input_format = op_desc.add_attrs(); + input_format->set_name("input_format"); + input_format->set_type(paddle::framework::AttrType::INTS); + for (auto i : _input_format) { + input_format->add_ints(i); + } + + auto output_format = op_desc.add_attrs(); + output_format->set_name("output_format"); + output_format->set_type(paddle::framework::AttrType::INTS); + for (auto i : std::vector{0, 1, 2}) { + output_format->add_ints(i); + } + + auto inlink_alias = op_desc.add_attrs(); + inlink_alias->set_name("inlink_alias"); + inlink_alias->set_type(paddle::framework::AttrType::STRINGS); + + auto outlink_alias = op_desc.add_attrs(); + outlink_alias->set_name("outlink_alias"); + outlink_alias->set_type(paddle::framework::AttrType::STRINGS); + + auto pre_memories = op_desc.add_attrs(); + pre_memories->set_name("pre_memories"); + pre_memories->set_type(paddle::framework::AttrType::STRINGS); + + auto memories = op_desc.add_attrs(); + memories->set_name("memories"); + memories->set_type(paddle::framework::AttrType::STRINGS); + + // create inlink_alias + for (const auto& item : + std::vector{"x@alias", "x0@alias", "x1@alias"}) { + inlink_alias->add_strings(item); + } + // pre memories + for (const auto& item : + std::vector{"rnn/x@pre", "rnn/h@pre"}) { + pre_memories->add_strings(item); + } + // memories + for (const auto& item : std::vector{"rnn/x", "rnn/h"}) { + memories->add_strings(item); + } + // output alias + for (const auto& item : std::vector{"h@alias"}) { + outlink_alias->add_strings(item); + } + + rnn_op_ = OpRegistry::CreateOp(op_desc); + + LOG(INFO) << "rnn_op finish init"; + } + + void CreateStepNet() { + LOG(INFO) << "create variable step_net"; + Variable* var = scope_->CreateVariable("step_net"); + auto net = var->GetMutable(); + // rnn/s is net's input or output? + net->inputs_ = {"rnn/h@pre", "rnn/w", "rnn/x"}; + net->inputs_ = {"rnn/s", "rnn/h"}; + net->AddOp( + OpRegistry::CreateOp("mul", {"rnn/h@pre", "rnn/w"}, {"rnn/s"}, {})); + + net->AddOp( + OpRegistry::CreateOp("add_two", {"rnn/x", "rnn/s"}, {"rnn/h"}, {})); + net->CompleteAddOp(); + } + + // father scope + std::shared_ptr scope_; + std::shared_ptr rnn_op_; +}; + +TEST_F(RecurrentOpTest, Run) { + platform::CPUDeviceContext ctx; + rnn_op_->InferShape(scope_); + rnn_op_->Run(scope_, ctx); +} + +class RecurrentGradientAlgorithmTest : public ::testing::Test { +protected: + virtual void SetUp() override { + CreateGlobalVariables(); + CreateStepScopes(); + CreateStepNet(); + CreateRNNGradientAlgorithm(); + + // segment inputs + SegmentInputs(); + // link forward memories + LinkeMemories(); + } + + virtual void TearDown() override {} + + void CreateGlobalVariables() { + scope_ = std::make_shared(); + // inputs: x + LOG(INFO) << "create global variable x"; + Variable* x = scope_->CreateVariable("x"); + DDim dims = + make_ddim({10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/}); + x->GetMutable()->mutable_data(dims, platform::CPUPlace()); + // inputs: h_boot + LOG(INFO) << "create global variable h_boot"; + Variable* h_boot = scope_->CreateVariable("h_boot"); + h_boot->GetMutable()->mutable_data( + make_ddim({20 /*batch size*/, 30 /*input dim*/}), platform::CPUPlace()); + // inputs: w + LOG(INFO) << "create global variable w"; + Variable* w = scope_->CreateVariable("rnn/w"); + w->GetMutable()->mutable_data(make_ddim({30, 30}), + platform::CPUPlace()); + // inputs: h_grad + LOG(INFO) << "create variable h_grad"; + Variable* dh = scope_->CreateVariable("h_grad"); + dh->GetMutable()->mutable_data(make_ddim({10, 20, 30}), + platform::CPUPlace()); + // inputs: step_scopes + LOG(INFO) << "create variable step_scopes"; + scope_->CreateVariable("step_scopes"); + // inputs: step_net + LOG(INFO) << "create variable step_net"; + scope_->CreateVariable("step_net"); + // outputs: w_grad + LOG(INFO) << "create global variable w_grad"; + scope_->CreateVariable("rnn/w_grad"); + // outputs: x_grad + LOG(INFO) << "create global variable x_grad"; + scope_->CreateVariable("x_grad"); + // outputs: h_boot_grad + LOG(INFO) << "create global variable h_boot_grad"; + scope_->CreateVariable("h_boot_grad"); + } + + void CreateStepScopes() { + std::vector>* step_scopes = + scope_->GetVariable("step_scopes") + ->GetMutable>>(); + for (int i = 0; i < 10; ++i) { + auto scope = std::make_shared(scope_); + auto pre_t = scope->CreateVariable("rnn/pre_h")->GetMutable(); + pre_t->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + auto tensor = scope->CreateVariable("rnn/h")->GetMutable(); + tensor->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + + // for unit test of ConcatOutputs + auto xg = scope->CreateVariable("rnn/x_grad")->GetMutable(); + xg->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + + step_scopes->push_back(scope); + } + + // last time step + auto g = (*step_scopes)[9] + ->CreateVariable("rnn/h_pre_grad") + ->GetMutable(); + g->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + } + + void CreateRNNGradientAlgorithm() { + std::unique_ptr arg(new rnn::Argument()); + arg->step_net = "step_net"; + arg->step_scopes = "step_scopes"; + rnn::Link inlink; + inlink.external = "h_grad"; + inlink.internal = "rnn/h_grad"; + arg->inlinks = std::vector{inlink}; + + rnn::Link outlink; + outlink.external = "x_grad"; + outlink.internal = "rnn/x_grad"; + arg->outlinks = std::vector{outlink}; + + rnn::MemoryAttr mem_attr; + mem_attr.pre_var = "rnn/h_pre_grad"; + mem_attr.var = "rnn/h_grad"; + mem_attr.boot_var = "h_boot_grad"; + arg->memories = std::vector{mem_attr}; + + rnn_grad_algo_.Init(std::move(arg)); + } + + void CreateStepNet() { + LOG(INFO) << "create variable step_net"; + Variable* var = scope_->CreateVariable("step_net"); + auto net = var->GetMutable(); + net->AddOp(OpRegistry::CreateOp("mul", + {"rnn/h_pre", "rnn/w", "rnn/s_grad"}, + {"rnn/h_pre_grad", "rnn/w_grad"}, + {})); + + net->AddOp(OpRegistry::CreateOp( + "add_two", {"rnn/h_grad"}, {"rnn/x_grad", "rnn/s_grad"}, {})); + net->CompleteAddOp(); + } + + void SegmentInputs() { + LOG(INFO) << "segment inputs"; + std::vector inlinks = {"x"}; + std::vector inlinks_alias = {"rnn/x"}; + + rnn::Link inlink; + inlink.external = "x"; + inlink.internal = "rnn/x"; + std::vector>* step_scopes = + scope_->GetVariable("step_scopes") + ->GetMutable>>(); + rnn::SegmentInputs(*step_scopes, std::vector{inlink}, 10); + } + + void LinkeMemories() { + LOG(INFO) << "link memories"; + rnn::MemoryAttr mem_attr; + mem_attr.pre_var = "rnn/h_pre"; + mem_attr.var = "rnn/h"; + mem_attr.boot_var = "boot_h"; + std::vector memories; + memories.push_back(mem_attr); + std::vector>* step_scopes = + scope_->GetVariable("step_scopes") + ->GetMutable>>(); + for (int i = 1; i < 10; ++i) { + rnn::LinkMemories(*step_scopes, memories, i, -1); + } + } + + std::shared_ptr scope_; + RecurrentGradientAlgorithm rnn_grad_algo_; +}; + +// TEST_F(RecurrentGradientAlgorithmTest, Run) { +// platform::CPUDeviceContext ctx; +// rnn_grad_algo_.Run(scope_, ctx); +// } + +} // namespace operators +} // namespace paddle + +TEST(RecurrentOp, LinkMemories) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + // create and init step scopes + int len = 10; + std::vector> step_scopes; + for (int i = 0; i < len; ++i) { + auto scope = std::make_shared(); + scope->CreateVariable("pre_h"); + auto tensor = scope->CreateVariable("h")->GetMutable(); + float* data = tensor->mutable_data(make_ddim({15, 20}), CPUPlace()); + for (int i = 0; i < 15 * 20; ++i) { + data[i] = rand() * (1. / (double)RAND_MAX); + } + step_scopes.push_back(scope); + } + + // create MemoryAttr + rnn::MemoryAttr mem_attr; + mem_attr.pre_var = "pre_h"; + mem_attr.var = "h"; + mem_attr.boot_var = "boot_h"; + std::vector memories; + memories.push_back(mem_attr); + + for (int i = 1; i < len; ++i) { + rnn::LinkMemories(step_scopes, memories, i, -1); + } + // check + for (int i = 0; i < len - 1; ++i) { + const float* a = + step_scopes[i]->GetVariable("h")->GetMutable()->data(); + const float* b = step_scopes[i + 1] + ->GetVariable("pre_h") + ->GetMutable() + ->data(); + for (size_t i = 0; i < 15 * 20; ++i) { + ASSERT_FLOAT_EQ(a[i], b[i]); + } + } + + for (int i = len - 2; i >= 0; --i) { + rnn::LinkMemories(step_scopes, memories, i, 1); + } + // check + for (int i = len - 2; i >= 0; --i) { + const float* a = step_scopes[i] + ->GetVariable("pre_h") + ->GetMutable() + ->data(); + const float* b = step_scopes[i + 1] + ->GetVariable("h") + ->GetMutable() + ->data(); + for (size_t i = 0; i < 15 * 20; ++i) { + ASSERT_FLOAT_EQ(a[i], b[i]); + } + } +} + +USE_OP(add_two); +USE_OP(mul); diff --git a/paddle/operators/rnn_design.md b/paddle/operators/rnn_design.md new file mode 100644 index 0000000000..3d38b9a0ad --- /dev/null +++ b/paddle/operators/rnn_design.md @@ -0,0 +1,239 @@ +# RNN 变长输入设计 +对变长序列的学习,现有主流框架比如 tensorflow, pytorch, caffe2, mxnet 等均使用了padding的方式, +即将一个mini-batch内不同长度的序列补0到固定长度参与计算。 + +现有Paddle包括 `RecurrentLayerGroup` 在内的RNN均实现了无padding的变长序列支持,本文也将基于该模块的思路,设计重构后的变长序列支持。 + +## 背景介绍 +由于tensor必须有明确的shape,因此基于tensor 的主流框架在存储变长序列时, +必须用zero-padding的方式将变长序列补全为固定shape的tensor。 + +由于padding是一种框架实现变长序列的妥协, 从用户角度,在使用RNN类模型时自然会比较介意padding的存在, +因此会有pytorch中对非padding方式变长序列支持长篇的讨论[3]。 + +由于padding对内存和计算会有额外的消耗,tensorflow和mxnet均使用了bucketing来进行优化[1][2], +但不管是padding还是bucket,对于用户都是额外的使用负担。 + +因此,**paddle原生支持变长序列的方式,能直接满足用户对变长序列的最直接的需求,在当前主流平台中可以算是一大优势**。 + +但对变长序列的支持,需要对目前框架做一些修改,下面讨论如何在最小修改下支持变长序列。 + +## 多层序列数据格式 `LODTensor` +目前 Paddle 会将一个mini-batch内的数据存储在一维的内存上, +额外使用 `Argument.sequenceStartPositions` 来存储每个句子的信息。 + +Paddle里使用 `Argument.subSequenceStartPositions` 来存储2层的序列信息,更高维度的序列则无法直接支持; + +为了支持 `N-level` 序列的存储,本文将序列信息定义成如下数据结构: + +```c++ +std::shared_ptr>> lod_start_pos_; +``` + +或者更明确的定义 + +```c++ +typedef std::vector level_t; +std::vector lod_start_pos; +``` + +这里的每一个 `level_t` 存储一个粒度(level)的偏移信息,和paddle目前做法一致。 + +为了更透明地传递序列信息,我们引入了一种新的tensor 称为 `LODTensor`[4], +其关于tensor相关的接口都直接继承自 `Tensor`,但另外添加了序列相关接口。 +如此,在操作一个 `LODTensor` 时,普通 `Op` 直接当成 `Tensor` 使用, +而操作序列的 `Op` 会额外操作 `LODTensor` 的变长序列操作的相关接口。 + +`LODTensor` 具体定义如下: + +```c++ +class LODTensor : public Tensor { +public: + size_t Levels() const { return seq_start_positions_.size(); } + size_t Elements(int level = 0) const { + return seq_start_positions_[level].size(); + } + // slice of level[elem_begin: elem_end] + // NOTE low performance in slice seq_start_positions_. + // TODO should call Tensor's Slice. + LODTensor LODSlice(int level, int elem_begin, int elem_end) const; + + // slice with tensor's data shared with this. + LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const; + + // copy other's lod_start_pos_, to share LOD info. + // NOTE the LOD info sould not be changed. + void ShareConstLODFrom(const LODTensor &other) { + lod_start_pos_ = other.lod_start_pos_; + } + // copy other's lod_start_pos_'s content, free to mutate. + void ShareMutableLODFrom(const LODTensor &other) { + lod_start_pos_ = std::make_shared < + std::vector>(other.lod_start_pos_.begin(), + other.lod_start_pos_.end()); + } + +private: + std::shared_ptr>> lod_start_pos_; +}; +``` + +其中, `lod_start_pos_` 使用了 `shared_ptr` 来减少存储和复制的代价, +可以认为 `LODTensor` 是 `Tensor` 的扩展,几乎完全兼容原始 `Tensor` 的使用。 + +## 框架支持 +### 框架现有的 `Tensor` 调用替换为 `LODTensor` +为了实现 `LODTensor` 的传递,框架里很多 `Tensor` 都需要变成 `LODTensor`, +简单实现,直接 **把之前所有的`Tensor` 全部替换成 `LODTensor`,这里可以直接修改 `pybind.cc` 里面创建`Tensor`的接口**。 + +此外,用户有可能需要感知序列的存在(比如序列的可视化需要解析模型中输出的序列),因此一些序列操作的API也需要暴露到 python 层。 + +### `lod_start_pos` 随着Op调用链传递 +框架需要支持下列特性,以实现`lod_start_pos`的传递: + +1. 以 `shared_ptr` 的方式实现传递 + - 不修改 `lod_start_pos` 内容的作为 consumer + - 修改 `lod_start_pos` 的作为 producer + - 约定 consumer 只需要复制传递过来的 `shared_ptr` + - producer 需要创建自己的独立的内存,以存储自己独立的修改,并暴露 `shared_ptr` 给后续 consumer + - 由于传递过程是以复制`shared_ptr`的方式实现,因此框架只需要传递一次 `lod_start_pos` + +2. 对于不感知 `lod_start_pos` 的Op足够透明 +3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据 + +具体的设计分为以下3小节 + +#### `load_start_pos` 的传递 + +- 对于不需要修改 `lod_start_pos` 的情况,调用 LODTensor的 `ShareConstLODFrom` 接口实现复制 +- 需要修改的,调用`ShareMutableLODFrom` 接口自己分配内存以存储修改 + +#### 框架透明 +传递这一步需要加入到网络跑之前的初始化操作中,并且只需要初始化一次,基于当前框架设计的初步方案如下 + +- 在 Op 的 `attrs` 中添加一项 `do_mutate_lod_info` 的属性,默认为 `false` + - 有需要修改 `lod_start_pos` 的Op需要在定义 `OpProto` 时设置为 `true` +- `OperatorBase` 的 `InferShape` 中会读取 `do_mutate_lod_info` ,并且调用 `LODTensor` 相关的方法实现 `lod_start_pos` 的复制。 +- `OperatorBase` 中添加一个 member `is_lod_inited{false}` 来保证传递只进行一次 + +一些逻辑如下 + +```c++ +class OperatorBase { +public: + // ... + void InferShape() { + if (!is_load_inited) { + bool do_mutate_lod_info = GetAttr("do_mutate_load_info"); + // find a input having LOD to copy + auto lod_input = ValidLODInput(); + for (auto &output : outputs) { + if (do_mutate_load_info) { + output.ShareMutableLODFrom(lod_input); + } else { + output.ShareConstLODFrom(load_input); + } + } + is_pod_inited = true; + } + + // call op's InferShape + // ... + } + +private: + // ... + bool is_lod_inited{false}; +}; +``` + +如此,`lod_start_pos` 的信息的传递对非OLD的Op的实现是完全透明的。 + +#### `lod_start_pos` 的更新 +上一小节介绍到,对于需要修改 `load_start_pos` 的Op,`OperatorBase` 会分配一块自己的内存以存储修改, +Op在 `Run` 的实现中,操作更新自己的 `load_start_pos` , +而所有依赖其 outputs 的 op 会通过共享的指针自动获取到其更新。 + +## 根据长度排序 +按照长度排序后,从前往后的时间步的batch size会自然地递减,可以直接塞入 Net 做batch计算 + +比如原始的输入: + +``` +origin: +xxxx +xx +xxx + +-> sorted: +xxxx +xxx +xx +``` + +经过 `SegmentInputs` 之后,每个会有4个时间步,每个时间步的输入如下(纵向排列) + +``` +0 1 2 3 +x x x x +x x x +x x +``` + +为了追踪排序前后序列的变化,这里用 +```c++ +struct SortedSeqItem { + void *start{nullptr}; + void *end{nullptr}; +}; + +std::vector sorted_seqs; +``` +来追踪序列排序后的位置,并添加一个新的接口 + +```c++ +std::vector SortBySeqLen(const LODTensor& tensor); +``` + +由于输入序列的顺序变化,以下现有的接口需要针对性地修改: + +- InitMemories, memory需要根据 `sorted_seqs` 重新排列 +- SetmentInputs +- ConcatOutputs + +此外,由于 `sorted_seqs` 需要被 `RecurrentGradientOp` 复用,因此会变成 `RecurrentOp` 一个新的output输出, +之后作为 `RecurrentGradientOp` 的一个输入传入。 + +## InitMemories +由于序列顺序的变化,`boot_memories` 的batch上的element的顺序也需要对应重新排列。 + +## SegmentInputs +`SegmentInputs` 会依赖 `sorted_seqs` 的信息,将原始的序列按照排序后的序列顺序,从横向切割,转为每个step中的inputs。 + +即下面的转变: +``` +origin: +xxxx +xx +xxx + + | + | + \ / + ! +0 1 2 3 +x x x x +x x x +x x +``` +## ConcatOutputs +`ConcatOutputs` 需要 + +- 将每个时间步的输出重新还原为原始输入的序列顺序(以防止Infer阶段顺序打乱) +- 将每个序列concat 为规则的mini-batch表示 + +## 参考文献 +1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing) +2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html) +3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5) +4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail) diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index e04d69fa72..4129422fa7 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -13,15 +13,13 @@ limitations under the License. */ #include "paddle/operators/rowwise_add_op.h" -#include "paddle/framework/op_registry.h" namespace paddle { namespace operators { -class RowWiseAddOp : public framework::OperatorWithKernel { +class RowWiseAddOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs is needed by rowwise add"); auto dim0 = inputs[0]->dims(); auto dim1 = inputs[1]->dims(); @@ -34,11 +32,10 @@ protected: } }; -class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker { +class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { public: - RowWiseAddOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The left input of row-wise add op, must be matrix"); AddInput("b", "The right input of row-wise add op, must be vector"); AddOutput("Out", "The output of row-wise add op"); @@ -53,9 +50,6 @@ for i in xrange(X.shape[0]): } // namespace operators } // namespace paddle -REGISTER_OP(rowwise_add, - paddle::operators::RowWiseAddOp, - paddle::operators::RowWiseAddOpMaker); -REGISTER_OP_CPU_KERNEL( - rowwise_add, - paddle::operators::RowWiseAddKernel); +REGISTER_OP(rowwise_add, ops::RowWiseAddOp, ops::RowWiseAddOpMaker); +REGISTER_OP_CPU_KERNEL(rowwise_add, + ops::RowWiseAddKernel); diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu index 5dfac4fd2c..4b33e38eba 100644 --- a/paddle/operators/rowwise_add_op.cu +++ b/paddle/operators/rowwise_add_op.cu @@ -1,6 +1,4 @@ -#include "paddle/framework/op_registry.h" #include "paddle/operators/rowwise_add_op.h" -REGISTER_OP_GPU_KERNEL( - rowwise_add, - paddle::operators::RowWiseAddKernel); +REGISTER_OP_GPU_KERNEL(rowwise_add, + ops::RowWiseAddKernel); diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index dc47fe7c84..4596925e93 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -13,25 +13,23 @@ limitations under the License. */ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class RowWiseAddKernel : public framework::OpKernel { +class RowWiseAddKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto in0 = context.Input(0)->Get(); - auto in1 = context.Input(1)->Get(); - auto* out = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto in0 = context.Input(0)->Get(); + auto in1 = context.Input(1)->Get(); + auto* out = context.Output(0)->GetMutable(); out->mutable_data(context.GetPlace()); - auto input = framework::EigenMatrix::From(in0); - auto bias = framework::EigenVector::From(in1); - auto output = framework::EigenMatrix::From(*out); + auto input = EigenMatrix::From(in0); + auto bias = EigenVector::From(in1); + auto output = EigenMatrix::From(*out); const int bias_size = bias.dimension(0); const int rest_size = input.size() / bias_size; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 66ab1e0011..f6c654a9e7 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -13,17 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/sgd_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class SGDOp : public framework::OperatorWithKernel { +class SGDOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "Input size of SGDOp must be two"); PADDLE_ENFORCE(outputs.size() == 1, "Output size of SGDOp must be one"); PADDLE_ENFORCE(inputs[0] != nullptr, "inputs[0] mast be set"); @@ -35,10 +32,10 @@ protected: } }; -class SGDOpMaker : public framework::OpProtoAndCheckerMaker { +class SGDOpMaker : public OpProtoAndCheckerMaker { public: - SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("param", "input parameter"); AddInput("grad", "input gradient"); AddOutput("param_out", "output parameter"); @@ -55,7 +52,5 @@ param_out = param - learning_rate * grad; } // namespace operators } // namespace paddle -REGISTER_OP(sgd, paddle::operators::SGDOp, paddle::operators::SGDOpMaker); -typedef paddle::operators::SGDOpKernel<::paddle::platform::CPUPlace, float> - SGDOpKernel_CPU_float; -REGISTER_OP_CPU_KERNEL(sgd, SGDOpKernel_CPU_float); +REGISTER_OP(sgd, ops::SGDOp, ops::SGDOpMaker); +REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 400425db10..f8f5b90cab 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -1,5 +1,3 @@ #include "paddle/operators/sgd_op.h" -#include "paddle/framework/op_registry.h" -typedef paddle::operators::SGDOpKernel<::paddle::platform::GPUPlace, float> SGDOpKernel_GPU_float; -REGISTER_OP_GPU_KERNEL(sgd, SGDOpKernel_GPU_float); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel); \ No newline at end of file diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index 4b2d214618..65179d323b 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -13,28 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class SGDOpKernel : public framework::OpKernel { +class SGDOpKernel : public OpKernel { public: - void Compute(const framework::KernelContext& ctx) const override { - auto param = ctx.Input("param")->Get(); - auto grad = ctx.Input("grad")->Get(); - auto* param_out = ctx.Output(0)->GetMutable(); + void Compute(const KernelContext& ctx) const override { + auto param = ctx.Input("param")->Get(); + auto grad = ctx.Input("grad")->Get(); + auto* param_out = ctx.Output(0)->GetMutable(); float lr = ctx.op_.GetAttr("learning_rate"); param_out->mutable_data(ctx.GetPlace()); - framework::EigenVector::Flatten(*param_out) - .device(*(ctx.GetEigenDevice())) = - framework::EigenVector::Flatten(param) - - lr * framework::EigenVector::Flatten(grad); + EigenVector::Flatten(*param_out).device(*(ctx.GetEigenDevice())) = + EigenVector::Flatten(param) - lr * EigenVector::Flatten(grad); } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index bf63af28b0..716f1d9c4d 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -13,37 +13,33 @@ limitations under the License. */ #include "paddle/operators/sigmoid_op.h" -#include "paddle/framework/op_registry.h" namespace paddle { namespace operators { -class SigmoidOp : public framework::OperatorWithKernel { +class SigmoidOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input"); PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output"); outputs[0]->Resize(inputs[0]->dims()); } }; -class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { +class SigmoidOpMaker : public OpProtoAndCheckerMaker { public: - SigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "sigmoid input"); AddOutput("Y", "sigmoid output"); AddComment("Sigmoid function"); } }; -class SigmoidOpGrad : public framework::OperatorWithKernel { +class SigmoidOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "SigmoidGrad"; return ""; @@ -53,11 +49,7 @@ protected: } // namespace operators } // namespace paddle -REGISTER_OP(sigmoid, - paddle::operators::SigmoidOp, - paddle::operators::SigmoidOpMaker); -REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, paddle::operators::SigmoidOpGrad); +REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker); +REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad); -REGISTER_OP_CPU_KERNEL( - sigmoid, - paddle::operators::SigmoidKernel); +REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu index ed344b2bfd..f679b20418 100644 --- a/paddle/operators/sigmoid_op.cu +++ b/paddle/operators/sigmoid_op.cu @@ -1,5 +1,3 @@ #include "paddle/operators/sigmoid_op.h" -#include "paddle/framework/op_registry.h" -REGISTER_OP_GPU_KERNEL( - sigmoid, paddle::operators::SigmoidKernel); +REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h index 2b9356246c..896a6f5d83 100644 --- a/paddle/operators/sigmoid_op.h +++ b/paddle/operators/sigmoid_op.h @@ -14,25 +14,23 @@ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class SigmoidKernel : public framework::OpKernel { +class SigmoidKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto input = context.Input(0)->Get(); - auto* output = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto input = context.Input(0)->Get(); + auto* output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - framework::EigenVector::Flatten(*output).device( + EigenVector::Flatten(*output).device( *(context.GetEigenDevice())) = - 1.0 / (1.0 + (-1.0 * framework::EigenVector::Flatten(input)).exp()); + 1.0 / (1.0 + (-1.0 * EigenVector::Flatten(input)).exp()); } }; } // namespace operators diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 82f72fa19f..df60b62fa6 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -12,16 +12,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/softmax_op.h" -#include "paddle/framework/op_registry.h" namespace paddle { namespace operators { -class SoftmaxOp : public framework::OperatorWithKernel { +class SoftmaxOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax"); PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "The input of softmax op must be matrix"); @@ -31,10 +29,9 @@ protected: } }; -class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { +class SoftmaxOpMaker : public OpProtoAndCheckerMaker { public: - SoftmaxOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "input of softmax"); AddOutput("Y", "output of softmax"); @@ -42,11 +39,10 @@ public: } }; -class SoftmaxOpGrad : public framework::OperatorWithKernel { +class SoftmaxOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "SoftmaxOpGrad"; return ""; @@ -56,9 +52,6 @@ protected: } // namespace operators } // namespace paddle -namespace ops = paddle::operators; - REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker); -REGISTER_GRADIENT_OP(softmax, softmax_grad, paddle::operators::SoftmaxOpGrad); -REGISTER_OP_CPU_KERNEL(softmax, - ops::SoftmaxKernel); +REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad); +REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel); diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu index 60676191eb..a1f6944a36 100644 --- a/paddle/operators/softmax_op.cu +++ b/paddle/operators/softmax_op.cu @@ -1,5 +1,4 @@ #include "paddle/framework/op_registry.h" #include "paddle/operators/softmax_op.h" -REGISTER_OP_GPU_KERNEL( - softmax, paddle::operators::SoftmaxKernel); +REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 500c188dbf..625a87b585 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -14,23 +14,21 @@ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class SoftmaxKernel : public framework::OpKernel { +class SoftmaxKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto input = context.Input(0)->Get(); - auto* output = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto input = context.Input(0)->Get(); + auto* output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - auto logits = framework::EigenMatrix::From(input); - auto softmax = framework::EigenMatrix::From(*output); + auto logits = EigenMatrix::From(input); + auto softmax = EigenMatrix::From(*output); const int kBatchDim = 0; const int kClassDim = 1; diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h new file mode 100644 index 0000000000..b712e457ff --- /dev/null +++ b/paddle/operators/type_alias.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/net.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using OpKernel = framework::OpKernel; +using KernelContext = framework::KernelContext; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; +template +using EigenTensor = framework::EigenTensor; +using Tensor = framework::Tensor; +using OperatorWithKernel = framework::OperatorWithKernel; +using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker; +using OpProto = framework::OpProto; +using OpAttrChecker = framework::OpAttrChecker; +using CPUPlace = platform::CPUPlace; +using GPUPlace = platform::GPUPlace; +using NetOp = framework::NetOp; +using OpRegistry = framework::OpRegistry; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 9c1d94e9e7..a928e09778 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -20,12 +20,101 @@ Eigen::DefaultDevice* DeviceContext::get_eigen_device() return reinterpret_cast(this)->eigen_device(); } +CPUDeviceContext::CPUDeviceContext() { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +CPUDeviceContext::CPUDeviceContext(CPUPlace place) { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } + #ifndef PADDLE_ONLY_CPU + template <> Eigen::GpuDevice* DeviceContext::get_eigen_device() const { return reinterpret_cast(this)->eigen_device(); } -#endif + +CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { + SetDeviceId(place_.device); + // TODO(qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly + // here will cause segment fault. We must implement a class derived from + // Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id + // later. Please refer to the implementation of class EigenCudaStreamDevice + // in TensorFlow. + // + // We find that CUDA 7 introduces a new option, the per-thread default stream, + // that has two effects. Please refer to https://devblogs.nvidia.com/ + // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ + // + // So, we decide to use default stream and add –default-stream per-thread nvcc + // flag. Than, two threads with two CUDADeviceContexts will run parallelly. + eigen_stream_.reset(new Eigen::CudaStreamDevice()); + eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); +} + +CUDADeviceContext::~CUDADeviceContext() { + SetDeviceId(place_.device); + Wait(); + if (cublas_handle_) { + PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); + } + + if (cudnn_handle_) { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + } + + if (curand_generator_) { + PADDLE_ENFORCE(dynload::curandDestroyGenerator(curand_generator_)); + } + eigen_stream_.reset(); + eigen_device_.reset(); +} + +Place CUDADeviceContext::GetPlace() const { return place_; } + +void CUDADeviceContext::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(0)); +} + +Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +cublasHandle_t CUDADeviceContext::cublas_handle() { + if (!cublas_handle_) { + SetDeviceId(place_.device); + PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); + } + return cublas_handle_; +} + +cudnnHandle_t CUDADeviceContext::cudnn_handle() { + if (!cudnn_handle_) { + SetDeviceId(place_.device); + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + } + return cudnn_handle_; +} + +curandGenerator_t CUDADeviceContext::curand_generator() { + if (!curand_generator_) { + SetDeviceId(place_.device); + PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_, + CURAND_RNG_PSEUDO_DEFAULT)); + PADDLE_ENFORCE( + dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_)); + } + return curand_generator_; +} + +#endif // PADDLE_ONLY_CPU } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index fe6f13e399..2038fafe2e 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -39,14 +39,13 @@ class DeviceContext { class CPUDeviceContext : public DeviceContext { public: - CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } + CPUDeviceContext(); + CPUDeviceContext(CPUPlace); + virtual ~CPUDeviceContext() {} - Eigen::DefaultDevice* eigen_device() const { return eigen_device_.get(); } + Eigen::DefaultDevice* eigen_device() const; - Place GetPlace() const override { - Place retv = CPUPlace(); - return retv; - } + Place GetPlace() const override; private: std::unique_ptr eigen_device_; @@ -54,119 +53,46 @@ class CPUDeviceContext : public DeviceContext { #ifndef PADDLE_ONLY_CPU -class GPUPlaceGuard { +class CUDADeviceContext : public DeviceContext { public: - explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { - if (previous_ != new_place) { - paddle::platform::SetDeviceId(new_place.device); - } - } + explicit CUDADeviceContext(GPUPlace); + virtual ~CUDADeviceContext(); - ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); } + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const; - private: - GPUPlace previous_; -}; + /*! \brief Return place in the device context. */ + Place GetPlace() const override; -class CUDADeviceContext : public DeviceContext { - public: - explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(cudaStreamCreate(&stream_), "cudaStreamCreate failed"); - eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_)); - eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - } - - Place GetPlace() const override { - Place retv = GPUPlace(); - return retv; - } - - void Wait() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_), - "cudaStreamSynchronize failed"); - } - - cudaStream_t stream() { return stream_; } - - Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); } - - cublasHandle_t cublas_handle() { - if (!blas_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_), - "cublasCreate failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::cublasSetStream(blas_handle_, stream_), - "cublasSetStream failed"); - } - return blas_handle_; - } - - cudnnHandle_t cudnn_handle() { - if (!dnn_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_), - "cudnnCreate failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::cudnnSetStream(dnn_handle_, stream_), - "cudnnSetStream failed"); - } - return dnn_handle_; - } - - curandGenerator_t curand_generator() { - if (!rand_generator_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( - &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT), - "curandCreateGenerator failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( - rand_generator_, random_seed_), - "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetStream(rand_generator_, stream_), - "curandSetStream failed"); - } - return rand_generator_; - } - - ~CUDADeviceContext() { - Wait(); - if (blas_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_), - "cublasDestroy failed"); - } - - if (dnn_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_), - "cudnnDestroy failed"); - } - - if (rand_generator_) { - PADDLE_ENFORCE( - paddle::platform::dynload::curandDestroyGenerator(rand_generator_), - "curandDestroyGenerator failed"); - } - eigen_stream_.reset(); - eigen_device_.reset(); - PADDLE_ENFORCE(cudaStreamDestroy(stream_), "cudaStreamDestroy failed"); - } + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; + + // clang-format off + /*! \brief Return cublas handle in the device context. */ + cublasHandle_t cublas_handle (); + + /*! \brief Return cudnn handle in the device context. */ + cudnnHandle_t cudnn_handle (); + + /*! \brief Return curand handle in the device context. */ + curandGenerator_t curand_generator(); + // clang-format on private: - GPUPlace gpu_place_; - cudaStream_t stream_; + GPUPlace place_; - std::unique_ptr eigen_stream_; + private: std::unique_ptr eigen_device_; + std::unique_ptr eigen_stream_; - cublasHandle_t blas_handle_{nullptr}; - - cudnnHandle_t dnn_handle_{nullptr}; + private: + uint64_t seed_; - int random_seed_; - curandGenerator_t rand_generator_{nullptr}; + // clang-format off + cudnnHandle_t cudnn_handle_ = nullptr; + cublasHandle_t cublas_handle_ = nullptr; + curandGenerator_t curand_generator_ = nullptr; + // clang-format on }; #endif diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index b06ab8a2f1..fd4adbd9de 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -36,6 +36,21 @@ limitations under the License. */ namespace paddle { namespace platform { +struct EnforceNotMet : public std::exception { + std::exception_ptr exp_; + std::string err_str_; + + EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { + try { + std::rethrow_exception(exp_); + } catch (const std::exception& exp) { + err_str_ = string::Sprintf("%s at [%s:%d]", exp.what(), f, l); + } + } + + const char* what() const noexcept { return err_str_.c_str(); } +}; + // Because most enforce conditions would evaluate to true, we can use // __builtin_expect to instruct the C++ compiler to generate code that // always forces branch prediction of true. @@ -43,18 +58,11 @@ namespace platform { // For more details, please check https://stackoverflow.com/a/43870188/724872. #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -template -inline void throw_on_error(T e) { - throw_on_error(e, ""); -} - template inline typename std::enable_if::type throw_on_error( int stat, const Args&... args) { if (UNLIKELY(!(stat))) { - throw std::runtime_error( - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); + throw std::runtime_error(string::Sprintf(args...)); } } @@ -64,12 +72,8 @@ template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { if (UNLIKELY(e)) { - // clang-format off - throw thrust::system_error( - e, thrust::cuda_category(), - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); - // clang-format on + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); } } @@ -77,12 +81,8 @@ template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { if (stat != CURAND_STATUS_SUCCESS) { - // clang-format off - throw thrust::system_error( - cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); - // clang-format on + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); } } @@ -92,12 +92,8 @@ inline typename std::enable_if::type throw_on_error( if (stat == CUDNN_STATUS_SUCCESS) { return; } else { - // clang-format off - throw std::runtime_error( - platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); - // clang-format on + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); } } @@ -126,22 +122,32 @@ inline typename std::enable_if::type throw_on_error( } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { err = "CUBLAS: license error, "; } - throw std::runtime_error(err + string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); + throw std::runtime_error(err + string::Sprintf(args...)); } #endif // PADDLE_ONLY_CPU -#define PADDLE_THROW(...) \ - do { \ - throw std::runtime_error( \ - string::Sprintf(__VA_ARGS__) + \ - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \ +template +inline void throw_on_error(T e) { + throw_on_error(e, ""); +} + +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + std::make_exception_ptr( \ + std::runtime_error(string::Sprintf(__VA_ARGS__))), \ + __FILE__, __LINE__); \ } while (0) -#define PADDLE_ENFORCE(...) \ - do { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ +#define PADDLE_ENFORCE(...) \ + do { \ + try { \ + ::paddle::platform::throw_on_error(__VA_ARGS__); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ } while (0) } // namespace platform diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc index d7152f8150..2ac31812a8 100644 --- a/paddle/platform/enforce_test.cc +++ b/paddle/platform/enforce_test.cc @@ -23,7 +23,7 @@ TEST(ENFORCE, FAILED) { bool in_catch = false; try { PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); - } catch (const std::runtime_error& error) { + } catch (paddle::platform::EnforceNotMet error) { // your error handling code here in_catch = true; std::string msg = "Enforce is not ok 123 at all"; diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index fd1a142b40..7d0e68a8f3 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python - add_op fc_op sgd_op cross_entropy_op) + add_op fc_op sgd_op cross_entropy_op recurrent_network_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 1229451523..7ef62c27c3 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -38,6 +38,7 @@ USE_OP(mul); USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); +USE_OP_WITHOUT_KERNEL(recurrent_op); template void ExposeOperator(ClassType& m) { @@ -50,6 +51,11 @@ void ExposeOperator(ClassType& m) { .def("__str__", &ClassType::type::DebugString); } +static size_t UniqueIntegerGenerator() { + static std::atomic generator; + return generator.fetch_add(1); +} + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of PaddlePaddle"); @@ -103,6 +109,11 @@ All parameter, weight, gradient are variables in Paddle. [](pd::Variable& self) -> pd::Tensor* { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_net", + [](pd::Variable& self) -> pd::NetOp* { + return self.GetMutable(); + }, py::return_value_policy::reference); py::class_>(m, "Scope") @@ -112,7 +123,8 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference) .def("create_var", &pd::Scope::CreateVariable, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def("get_var_name", &pd::Scope::GetVariableName); //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. @@ -144,8 +156,8 @@ All parameter, weight, gradient are variables in Paddle. .def_static("gpu_context", [](paddle::platform::GPUPlace& place) -> paddle::platform::DeviceContext* { - return new paddle::platform::CUDADeviceContext(place); - }) + return new paddle::platform::CUDADeviceContext(place); + }) #endif ; // NOLINT py::class_(m, "GPUPlace").def(py::init()); @@ -166,24 +178,25 @@ All parameter, weight, gradient are variables in Paddle. }); ExposeOperator(operator_base); - using PlainNetPtr = std::shared_ptr; - py::class_ plain_net(m, "PlainNet"); + py::class_> net(m, "Net"); - plain_net - .def_static("create", - []() -> std::shared_ptr { - auto retv = std::make_shared(); - retv->type_ = "plain_net"; - return retv; - }) - .def("add_op", &pd::PlainNet::AddOp) + net.def_static("create", + []() -> std::shared_ptr { + auto retv = std::make_shared(); + retv->type_ = "plain_net"; + return retv; + }) + .def("add_op", &pd::NetOp::AddOp) .def("add_op", - [](PlainNetPtr& self, const PlainNetPtr& plain_net) -> void { - self->AddOp(std::static_pointer_cast(plain_net)); + [](pd::NetOp& self, const std::shared_ptr& net) -> void { + self.AddOp(std::static_pointer_cast(net)); }) - .def("complete_add_op", &pd::PlainNet::CompleteAddOp) - .def("complete_add_op", [](PlainNetPtr& self) { self->CompleteAddOp(); }); - ExposeOperator(plain_net); + .def("complete_add_op", &pd::NetOp::CompleteAddOp) + .def("complete_add_op", + [](std::shared_ptr& self) { self->CompleteAddOp(); }); + ExposeOperator(net); + + m.def("unique_integer", UniqueIntegerGenerator); return m.ptr(); } diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index a830ceba57..e1558e3fdf 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -76,7 +76,11 @@ void NewRemoteParameterUpdater::init( sgdConfigV2->set_decay(paramConfig.decay_rate()); optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); auto constlr = optimizeConfigV2.mutable_const_lr(); - constlr->set_learning_rate(paramConfig.learning_rate()); + if (paramConfig.has_learning_rate()) { + constlr->set_learning_rate(paramConfig.learning_rate()); + } else { + constlr->set_learning_rate(trainerConfig_.learning_rate()); + } if (trainerConfig_.algorithm() == "sgd") { optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); // FIXME: config all algorithms diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h index 27ddaab3f0..7cde983060 100644 --- a/paddle/utils/Error.h +++ b/paddle/utils/Error.h @@ -126,9 +126,11 @@ public: } /** - * @brief operator bool, return True if there is something error. + * @brief check this status by glog. + * @note It is a temp method used during cleaning Paddle code. It will be + * removed later. */ - operator bool() const { return !this->isOK(); } + void check() const { CHECK(this->isOK()) << msg(); } /** * @brief isOK return True if there is no error. @@ -136,13 +138,6 @@ public: */ bool isOK() const { return msg_ == nullptr; } - /** - * @brief check this status by glog. - * @note It is a temp method used during cleaning Paddle code. It will be - * removed later. - */ - void check() const { CHECK(this->isOK()) << msg(); } - private: std::shared_ptr msg_; }; diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp index fdf326b17a..6f311fa6b8 100644 --- a/paddle/utils/tests/test_Error.cpp +++ b/paddle/utils/tests/test_Error.cpp @@ -18,17 +18,17 @@ limitations under the License. */ TEST(Error, testAll) { paddle::Error error; - ASSERT_FALSE(error); + ASSERT_TRUE(error.isOK()); error = paddle::Error("I'm the error"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("I'm the error", error.msg()); error = paddle::Error("error2"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("error2", error.msg()); int i = 3; auto error3 = paddle::Error("error%d", i); - ASSERT_TRUE(error3); + ASSERT_FALSE(error3.isOK()); ASSERT_STREQ("error3", error3.msg()); } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fc112f1327..5477158ecb 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2055,8 +2055,7 @@ class BatchNormLayer(LayerBase): # Automatically select cudnn_batch_norm for GPU and batch_norm for CPU. # Also based on cudnn version. use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \ - ((not parallel_nn) or self.config.device > -1) and \ - cudnn_version >= 4007 + ((not parallel_nn) or self.config.device > -1) self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm" super(BatchNormLayer, self).__init__( name, self.layer_type, 0, inputs=inputs, **xargs) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 9b9f979bb6..ecba871910 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -272,7 +272,7 @@ class ExtraLayerAttribute(object): for key in self.attr: if not hasattr(self, 'can_%s' % key) or \ not getattr(self, 'can_%s' % key): - raise NotImplementedError("Layer %s cannot support %s" % + raise NotImplementedError("Layer %s does not support %s" % (layer_name, key)) @staticmethod diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 21eba71527..14f072fc55 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -865,7 +865,7 @@ def data_layer(name, size, height=None, width=None, layer_attr=None): @wrap_name_default("embedding") @wrap_param_attr_default() -@layer_support(ERROR_CLIPPING) +@layer_support(ERROR_CLIPPING, DROPOUT) def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): """ Define a embedding Layer. @@ -1320,7 +1320,7 @@ def pooling_layer(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation()) @wrap_name_default("lstmemory") -@layer_support(DROPOUT) +@layer_support() def lstmemory(input, name=None, size=None, @@ -1429,7 +1429,7 @@ def lstmemory(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act"], act=TanhActivation()) @wrap_name_default("gru") -@layer_support(DROPOUT) +@layer_support() def grumemory(input, size=None, name=None, @@ -1793,7 +1793,7 @@ def repeat_layer(input, @wrap_name_default("seqreshape") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(ERROR_CLIPPING, DROPOUT) def seq_reshape_layer(input, reshape_size, act=None, @@ -2703,7 +2703,7 @@ def img_cmrnorm_layer(input, default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.)) @wrap_act_default(act=ReluActivation()) @wrap_name_default("batch_norm") -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def batch_norm_layer(input, act=None, name=None, @@ -2783,15 +2783,6 @@ def batch_norm_layer(input, :return: LayerOutput object. :rtype: LayerOutput """ - if not isinstance(act, ReluActivation): - logger.log(logging.WARN, - "%s is not recommend for batch normalization's activation, " - "maybe the relu is better" % act.name) - - if not isinstance(input.activation, LinearActivation): - logger.log(logging.WARN, - "The activation should be inside batch normalization, the " - "previous layer's activation may be Linear") if num_channels is None: if input.num_filters is not None: @@ -2861,7 +2852,7 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): @wrap_name_default("addto") @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): """ AddtoLayer. @@ -2940,7 +2931,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): @wrap_act_default(act=IdentityActivation()) @wrap_name_default("concat") -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): """ Concat all input vector into one huge vector. @@ -3024,7 +3015,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): @wrap_name_default("seqconcat") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, bias_attr=None): """ @@ -3177,7 +3168,7 @@ def memory(name, @wrap_act_default(param_names=['state_act'], act=TanhActivation()) @wrap_act_default(act=TanhActivation()) @wrap_name_default('lstm_step') -@layer_support(ERROR_CLIPPING, DROPOUT) +@layer_support() def lstm_step_layer(input, state, size=None, @@ -4480,7 +4471,7 @@ def tensor_layer(a, @wrap_param_attr_default() @wrap_bias_attr_default() @wrap_act_default() -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def selective_fc_layer(input, size, select=None, @@ -5974,7 +5965,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): """ The crop layer crops images by offset and shape. User can set crop shape by args 'shape' explicitly or by reference input layer. - + The example usage is: .. code-block:: python diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 07ab2c9b18..5bea980611 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -34,6 +34,7 @@ import minibatch import plot import image import model +import paddle.trainer.config_parser as cp __all__ = [ 'optimizer', @@ -58,6 +59,8 @@ __all__ = [ 'model', ] +cp.begin_parse() + def init(**kwargs): import py_paddle.swig_paddle as api @@ -73,6 +76,11 @@ def init(**kwargs): for key in args_dict.keys(): args.append('--%s=%s' % (key, str(args_dict[key]))) + if 'use_gpu' in kwargs: + cp.g_command_config_args['use_gpu'] = kwargs['use_gpu'] + assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not " + "supported in v2 APIs.") + api.initPaddle(*args) diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 645f3cc0dc..111496618d 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -166,55 +166,37 @@ def cluster_files_reader(files_pattern, return reader -def convert(output_path, - reader, - num_shards, - name_prefix, - max_lines_to_shuffle=1000): +def convert(output_path, reader, line_count, name_prefix): import recordio """ Convert data from reader to recordio format files. :param output_path: directory in which output files will be saved. :param reader: a data reader, from which the convert program will read data instances. - :param num_shards: the number of shards that the dataset will be partitioned into. :param name_prefix: the name prefix of generated files. :param max_lines_to_shuffle: the max lines numbers to shuffle before writing. """ - assert num_shards >= 1 - assert max_lines_to_shuffle >= 1 - - def open_writers(): - w = [] - for i in range(0, num_shards): - n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i, - num_shards - 1) - w.append(recordio.writer(n)) - - return w - - def close_writers(w): - for i in range(0, num_shards): - w[i].close() + assert line_count >= 1 + indx_f = 0 - def write_data(w, lines): + def write_data(indx_f, lines): random.shuffle(lines) - for i, d in enumerate(lines): + filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) + writer = recordio.writer(filename) + for l in lines: # FIXME(Yancey1989): # dumps with protocol: pickle.HIGHEST_PROTOCOL - o = pickle.dumps(d) - w[i % num_shards].write(o) + writer.write(cPickle.dumps(l)) + writer.close() - w = open_writers() lines = [] - for i, d in enumerate(reader()): lines.append(d) - if i % max_lines_to_shuffle == 0 and i >= max_lines_to_shuffle: - write_data(w, lines) + if i % line_count == 0 and i >= line_count: + write_data(indx_f, lines) lines = [] + indx_f += 1 continue - write_data(w, lines) - close_writers(w) + write_data(indx_f, lines) diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py index cffb319ad8..b705c9109b 100644 --- a/python/paddle/v2/dataset/mq2007.py +++ b/python/paddle/v2/dataset/mq2007.py @@ -242,9 +242,9 @@ def gen_list(querylist): if not isinstance(querylist, QueryList): querylist = QueryList(querylist) querylist._correct_ranking_() - relevance_score_list = [query.relevance_score for query in querylist] + relevance_score_list = [[query.relevance_score] for query in querylist] feature_vector_list = [query.feature_vector for query in querylist] - yield np.array(relevance_score_list).T, np.array(feature_vector_list) + yield np.array(relevance_score_list), np.array(feature_vector_list) def query_filter(querylists): diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py index 7248c3f52a..b034efffb6 100644 --- a/python/paddle/v2/framework/create_op_creation_methods.py +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -220,6 +220,9 @@ def create_op_creation_method(op_proto): __impl__.all_input_args = [var.name for var in op_proto.inputs] __impl__.all_output_args = [var.name for var in op_proto.outputs] __impl__.all_attr_args = [attr.name for attr in op_proto.attrs] + __impl__.all_not_temp_output_args = [ + var.name for var in op_proto.outputs if not var.temporary + ] return __impl__ diff --git a/python/paddle/v2/framework/network.py b/python/paddle/v2/framework/network.py new file mode 100644 index 0000000000..c85e87413e --- /dev/null +++ b/python/paddle/v2/framework/network.py @@ -0,0 +1,124 @@ +import paddle.v2.framework.core as core +from paddle.v2.framework.create_op_creation_methods import op_creations +from default_scope_funcs import create_var, get_var, get_cur_scope + +__all__ = ['Network'] # Only expose Network + + +class NetworkFunctor(object): + """ + Network Op Creation Function. Used internally in this module. + It convert string input to Variable. If it is not created before, just + create in scope. + + It is a functor object. means the instances are callable. + + :param func: The op creation function which generated in Python. + :param net: The Network instance. + """ + + def __init__(self, func, net): + self.func = func + self.net = net + + def __call__(self, *args, **kwargs): + if len(args) != 0: + raise ValueError("Paddle must use keyword argument") + inputs = self.func.all_input_args + for ipt in inputs: + if ipt in kwargs: + var = kwargs[ipt] + if isinstance(var, basestring): + var = create_var(var) + if not isinstance(var, core.Variable): + raise TypeError( + "Input of op creation must be string or variable") + + kwargs[ipt] = get_cur_scope().get_var_name(var) + + notemp_outputs = self.func.all_not_temp_output_args + + for name in notemp_outputs: + if name not in kwargs: + kwargs[ + name] = self.func.__name__ + "@OUT@%d" % core.unique_integer( + ) + + outputs = self.func.all_output_args + for opt in outputs: + if opt in kwargs: + var = kwargs[opt] + if isinstance(var, basestring): + var = create_var(var) + if not isinstance(var, core.Variable): + raise TypeError( + "Output of op creation must be string or variable") + kwargs[opt] = get_cur_scope().get_var_name(var) + + op = self.func(**kwargs) + + self.net.net.add_op(op) + + lst = [get_var(kwargs[opt]) for opt in notemp_outputs] + if len(lst) == 1: + return lst[0] + elif len(lst) == 0: + return None + else: + return lst + + +class Network(object): + """ + The network concept. It avoid user to manually create operator, create + variable, and combine them into a Net. Just use Network.xxx can create the + operator, create variables in default scope, and add them into `self.net`. + + For example: + + .. code-block: python + + net = Network() + out = net.add_two(X="a", Y="b") + fc_out = net.fc(X="out", W="fc.w") + + net.run(...) + """ + + def __init__(self): + self.net = core.Net.create() + funcs = (func_name for func_name in dir(op_creations) + if not func_name.startswith("__")) + + # TODO(yuyang18): This code can work, but do not generate a good + # docstring, try to give a better way generate function in runtime + # later. + for func_name in funcs: + func = getattr(op_creations, func_name) + impl = NetworkFunctor(func, self) + setattr(self, func_name, impl.__call__) + self.__complete_add_op__ = False + + def infer_shape(self): + self.complete_add_op() + self.net.infer_shape(get_cur_scope()) + + def run(self, device_context): + self.complete_add_op() + self.net.run(get_cur_scope(), device_context) + + def __str__(self): + return str(self.net) + + def complete_add_op(self): + if not self.__complete_add_op__: + self.net.complete_add_op() + self.__complete_add_op__ = True + + +if __name__ == '__main__': + net = Network() + out = net.add_two(X="a", Y="b") + fc_out = net.fc(X=out, W="fc.w", b="fc.b", activation="softmax") + net.complete_add_op() + print net diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index b3eb2ef8a8..cdaaa60674 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -3,7 +3,7 @@ add_python_test(test_framework test_scope.py test_default_scope_funcs.py test_op_creation_methods.py - test_plain_net.py + test_net.py test_tensor.py test_fc_op.py test_add_two_op.py @@ -12,4 +12,5 @@ add_python_test(test_framework test_mul_op.py test_sigmoid_op.py test_softmax_op.py - test_rowwise_add_op.py) + test_rowwise_add_op.py + test_network.py) diff --git a/python/paddle/v2/framework/tests/test_plain_net.py b/python/paddle/v2/framework/tests/test_net.py similarity index 92% rename from python/paddle/v2/framework/tests/test_plain_net.py rename to python/paddle/v2/framework/tests/test_net.py index 2b919aca28..db776d6b64 100644 --- a/python/paddle/v2/framework/tests/test_plain_net.py +++ b/python/paddle/v2/framework/tests/test_net.py @@ -5,11 +5,11 @@ import unittest class TestNet(unittest.TestCase): def test_net_all(self): - net = core.PlainNet.create() + net = core.Net.create() op1 = op_creations.add_two(X="X", Y="Y", Out="Out") net.add_op(op1) - net2 = core.PlainNet.create() + net2 = core.Net.create() net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out")) net2.complete_add_op(True) net.add_op(net2) diff --git a/python/paddle/v2/framework/tests/test_network.py b/python/paddle/v2/framework/tests/test_network.py new file mode 100644 index 0000000000..6d53e233e9 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_network.py @@ -0,0 +1,32 @@ +from paddle.v2.framework.network import Network +import paddle.v2.framework.core as core +import unittest + + +class TestNet(unittest.TestCase): + def test_net_all(self): + net = Network() + out = net.add_two(X="X", Y="Y") + fc_out = net.fc(X=out, W="w") + net.complete_add_op() + self.assertTrue(isinstance(fc_out, core.Variable)) + self.assertEqual( + '''Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, add_two@OUT@0, fc@OUT@1). + Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@0). + Op(fc), inputs:(add_two@OUT@0, w, @EMPTY@), outputs:(fc@OUT@1, @TEMP@fc@0). + Op(mul), inputs:(add_two@OUT@0, w), outputs:(@TEMP@fc@0). + Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc@OUT@1). +''', str(net)) + + net2 = Network() + tmp = net2.add_two(X="X", Y="Y") + self.assertTrue(isinstance(tmp, core.Variable)) + net2.complete_add_op() + self.assertEqual( + '''Op(plain_net), inputs:(X, Y), outputs:(add_two@OUT@2). + Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@2). +''', str(net2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py new file mode 100644 index 0000000000..0457e3f16a --- /dev/null +++ b/python/paddle/v2/framework/tests/test_recurrent_op.py @@ -0,0 +1,92 @@ +import paddle.v2.framework.core as core +import unittest +import numpy as np +import paddle.v2.framework.create_op_creation_methods as creation + +ops = creation.op_creations + + +def create_tensor(scope, name, shape): + tensor = scope.create_var(name).get_tensor() + tensor.set_dims(shape) + tensor.alloc_float() + tensor.set(np.random.random(shape)) + return tensor + + +class TestRNN(unittest.TestCase): + ''' + Test RNNOp + + equation: + h_t = \sigma (W x_t + U h_{t-1}) + weights: + - W + - U + vars: + - x + memories: + - h + outputs: + - h + ''' + + def init(self): + input_dim = 30 + batch_size = 50 + weight_dim = 15 + + self.scope = core.Scope(None) + + # create vars + create_tensor(self.scope, "x", [batch_size, input_dim]) + create_tensor(self.scope, "W", [input_dim, weight_dim]) + create_tensor(self.scope, "U", [weight_dim, weight_dim]) + create_tensor(self.scope, "h_boot", [batch_size, weight_dim]) + + x_alias = "x@alias" + y_alias = "y@alias" + memory = "h@alias" + prememory = "h@pre" + output = "rnn_out" + output_alias = "rnn_out@alias" + + # create step net + stepnet_var = self.scope.create_var("stepnet") + stepnet = stepnet_var.get_net() + # stepnet = core.Net.create() + x_fc_op = ops.fc(X=x_alias, W="W", Y="Wx") + h_fc_op = ops.fc(X=prememory, W="U", Y="Uh") + sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum") + sig_op = ops.sigmoid(X="sum", Y=memory) + stepnet.add_op(x_fc_op) + stepnet.add_op(h_fc_op) + stepnet.add_op(sum_op) + stepnet.add_op(sig_op) + stepnet.complete_add_op(True) + + # create RNNOp + rnnop = ops.recurrent_op( + # inputs + inlinks=["x"], + boot_memories=["h_boot"], + step_net="stepnet", + # outputs + outlinks=[output], + step_scopes="step_scopes", + # attributes + inlink_alias=["x@alias"], + outlink_alias=[output_alias], + pre_memories=[prememory], + memories=[memory]) + + ctx = core.DeviceContext.cpu_context() + rnnop.infer_shape(self.scope) + rnnop.run(self.scope, ctx) + + def test_recurrent(self): + self.init() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 40134a3270..4dcc3ab57e 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -35,6 +35,13 @@ class Inference(object): name = param.getName() assert isinstance(val, api.Vector) val.copyFromNumpyArray(parameters.get(name).flatten()) + # the setValueUpdated function is called in randomize, zeroMem, + # load function in paddle/parameter/Parameter.cpp. But in the + # inference mode, the setValueUpdated is never called, it will + # cause the parameter will not be dispatched + # in MultiGradientMachine for multi-GPU. So setValueUpdated is + # called here, but it's better to call this function in one place. + param.setValueUpdated() self.__gradient_machine__ = gm self.__data_types__ = topo.data_type() diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 4ade1c6f32..6a2bb8d337 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -324,6 +324,3 @@ def parse_network(output_layers, extra_layers=None): def get_layer(name): return config_base.__layer_map__.get(name) - - -cp.begin_parse() diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 3ac62d116b..b658a81630 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -49,7 +49,6 @@ class client(object): def set_dataset(self, paths): holder_type = ctypes.c_char_p * len(paths) holder = holder_type() - print paths for idx, path in enumerate(paths): c_ptr = ctypes.c_char_p(path) holder[idx] = c_ptr