Merge conflicts

cblas_new
liaogang 8 years ago
commit 365b457a04

6
.gitignore vendored

@ -19,3 +19,9 @@ third_party/
# clion workspace.
cmake-build-*
# generated while compiling
python/paddle/v2/framework/core.so
CMakeFiles
cmake_install.cmake

@ -4,6 +4,7 @@ cache:
- $HOME/.ccache
- $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party
- $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required
dist: trusty
os:
@ -11,6 +12,7 @@ os:
env:
- JOB=build_doc
- JOB=check_style
- JOB=build_android
addons:
apt:
packages:

@ -28,7 +28,9 @@ if(NOT CMAKE_CROSSCOMPILING)
endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED)
find_package(Threads REQUIRED)
find_package(Boost QUIET)
if(NOT ANDROID)
find_package(Boost QUIET)
endif()
include(simd)
@ -97,6 +99,7 @@ include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(cudnn) # set cudnn libraries, must before configure
include(configure) # add paddle env configuration
@ -139,6 +142,10 @@ endif(USE_NNPACK)
add_subdirectory(proto)
# "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer)
# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
# placed after this block, because they depends on it.
if(WITH_GOLANG)
@ -146,7 +153,9 @@ if(WITH_GOLANG)
endif(WITH_GOLANG)
add_subdirectory(paddle)
add_subdirectory(python)
if(WITH_PYTHON)
add_subdirectory(python)
endif()
if(WITH_DOC)
add_subdirectory(doc)
endif()

@ -106,6 +106,9 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
ENDIF()
ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
ENDIF()
@ -162,6 +165,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ENDIF()
ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a")
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
ENDIF()
STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")

@ -52,6 +52,7 @@ ExternalProject_Add(
ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
ADD_DEPENDENCIES(glog extern_glog)
ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)

@ -32,7 +32,12 @@ IF(NOT ${CBLAS_FOUND})
# arm_soft_fp_abi branch of OpenBLAS to support softfp
# https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(TARGET "ARMV7")
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(TARGET "ARMV8")
ENDIF()
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(RPI)
# use hardfp
SET(OPENBLAS_COMMIT "v0.2.19")

@ -0,0 +1,30 @@
INCLUDE(ExternalProject)
SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
ExternalProject_Add(
extern_pybind
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/pybind/pybind11.git"
GIT_TAG "v2.1.1"
PREFIX ${PYBIND_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
add_library(pybind STATIC ${dummyfile})
else()
add_library(pybind INTERFACE)
endif()
add_dependencies(pybind extern_pybind)
LIST(APPEND external_project_dependencies pybind)

@ -18,6 +18,9 @@ INCLUDE(python_module)
FIND_PACKAGE(PythonInterp 2.7)
IF(WITH_PYTHON)
FIND_PACKAGE(PythonLibs 2.7)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
ENDIF(WITH_PYTHON)
SET(py_env "")

@ -109,7 +109,9 @@ set(COMMON_FLAGS
-Wno-unused-function
-Wno-error=literal-suffix
-Wno-error=sign-compare
-Wno-error=unused-local-typedefs)
-Wno-error=unused-local-typedefs
-Wno-error=parentheses-equality # Warnings in Pybind11
)
set(GPU_COMMON_FLAGS
-fPIC

@ -90,11 +90,11 @@
# including binary directory for generated headers.
include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE)
if(NOT APPLE AND NOT ANDROID)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl")
endif(NOT APPLE)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID)
function(merge_static_libs TARGET_NAME)
set(libs ${ARGN})
@ -302,7 +302,7 @@ function(go_library TARGET_NAME)
file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
# FIXME: link path
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
# Golang build source code
@ -310,7 +310,7 @@ function(go_library TARGET_NAME)
-o "${${TARGET_NAME}_LIB_PATH}"
"./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
# must run under GOPATH
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
add_dependencies(${TARGET_NAME} go_vendor)
endfunction(go_library)
@ -321,14 +321,11 @@ function(go_binary TARGET_NAME)
cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
# FIXME: link path
add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH}
GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
-o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
"./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
# TODO: don't know what ${TARGET_NAME}_link does
add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
endfunction(go_binary)
@ -336,15 +333,18 @@ endfunction(go_binary)
function(go_test TARGET_NAME)
set(options OPTIONAL)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(multiValueArgs DEPS)
cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
-c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
${go_test_SRCS}
".${CMAKE_CURRENT_SOURCE_REL_DIR}"
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
add_test(NAME ${TARGET_NAME}
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
endfunction(go_test)
function(proto_library TARGET_NAME)

@ -37,7 +37,7 @@
\frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
假设 :math:`z = f(W^T x + b)` ,那么
假设 :math:`z = W^T x + b` ,那么
.. math::

@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It
where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu.
The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter.
The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter.
Suppose our loss function is :math:`c(y)`, then
@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then
\frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
Suppose :math:`z = f(W^T x + b)`, then
Suppose :math:`z = W^T x + b`, then
.. math::
@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class.
Then, for fully connected layer, we need to compute:
.. math::
\frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`.
@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
/* weight */ true);
}
}
If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
.. code-block:: bash

@ -17,3 +17,7 @@ add_subdirectory(pserver/client/c)
add_subdirectory(cmd/pserver)
add_subdirectory(cmd/master)
add_subdirectory(master/c)
add_subdirectory(master)
add_subdirectory(pserver)
add_subdirectory(pserver/client)
add_subdirectory(utils/networkhelper)

@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
go_binary(master SRC master.go DEPS paddle_go_optimizer)
go_binary(master SRC master.go)

@ -8,6 +8,7 @@ import (
"time"
"github.com/namsral/flag"
"github.com/topicai/candy"
"github.com/PaddlePaddle/Paddle/go/pserver"
log "github.com/sirupsen/logrus"
@ -18,53 +19,47 @@ func main() {
index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0")
etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
"comma separated endpoint string for pserver to connect to etcd")
etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls")
numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
checkpointInterval := flag.Int("checkpoint-interval", 600, "save checkpoint per interval seconds")
checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds")
logLevel := flag.String("log-level", "info",
"log level, possible values: debug, info, warning, error, fatal, panic")
flag.Parse()
level, err := log.ParseLevel(*logLevel)
if err != nil {
panic(err)
}
candy.Must(err)
log.SetLevel(level)
var idx int
var cp pserver.Checkpoint
var e *pserver.EtcdClient
if *index >= 0 {
idx = *index
} else {
timeout := time.Second * time.Duration((*etcdTimeout))
e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout)
idx, err = e.Register()
candy.Must(err)
cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)
if err != nil {
panic(err)
log.Errorf("Fetch checkpoint failed, %s", err)
}
}
s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
if err != nil {
panic(err)
}
candy.Must(err)
err = rpc.Register(s)
if err != nil {
panic(err)
}
candy.Must(err)
rpc.HandleHTTP()
l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
if err != nil {
panic(err)
}
candy.Must(err)
log.Infof("start pserver at port %d", *port)
err = http.Serve(l, nil)
if err != nil {
panic(err)
}
candy.Must(err)
}

14
go/glide.lock generated

@ -1,8 +1,8 @@
hash: b8f18ce6784bd3fadd9fed0b8443e7b658234ea785ae1f220723ae2c1f652aa7
updated: 2017-06-27T14:05:48.925262819+08:00
hash: a8faea3a363468a88917ddeb3b1c9ea36886fb2c622acbad42604fa9cb4d3855
updated: 2017-07-11T10:04:40.786745417+08:00
imports:
- name: github.com/coreos/etcd
version: 61fc123e7a8b14a0a258aa3f5c4159861b1ec2e7
version: cb2a496c4ddd1c87a9f280e116649b599999ec79
subpackages:
- auth/authpb
- clientv3
@ -22,7 +22,9 @@ imports:
- name: github.com/PaddlePaddle/recordio
version: edfb82af0739c84f241c87390ec5649c7b28c129
- name: github.com/sirupsen/logrus
version: 202f25545ea4cf9b191ff7f846df5d87c9382c2b
version: 7f976d3a76720c4c27af2ba716b85d2e0a7e38b1
- name: github.com/topicai/candy
version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
- name: golang.org/x/net
version: c8c74377599bd978aee1cf3b9b63a8634051cec2
subpackages:
@ -34,11 +36,11 @@ imports:
- lex/httplex
- trace
- name: golang.org/x/sys
version: f7928cfef4d09d1b080aa2b6fd3ca9ba1567c733
version: abf9c25f54453410d0c6668e519582a9e1115027
subpackages:
- unix
- name: golang.org/x/text
version: 4e9ab9ee170f2a39bd66c92b3e0a47ff47a4bc77
version: cfdf022e86b4ecfb646e1efbd7db175dd623a8fa
subpackages:
- secure/bidirule
- transform

@ -10,3 +10,4 @@ import:
version: ^1.7.4-pre
- package: github.com/sirupsen/logrus
version: ^1.0.0
- package: github.com/topicai/candy

@ -0,0 +1,3 @@
if(WITH_TESTING)
go_test(master_test)
endif()

@ -68,7 +68,7 @@ func (c *Client) getRecords() {
// We treat a task as finished whenever the last data
// instance of the task is read. This is not exactly
// correct, but a reasonable approximation.
c.taskFinished(t.ID)
c.taskFinished(t.Meta.ID)
}
}
@ -118,6 +118,11 @@ func (c *Client) taskFinished(taskID int) error {
return c.conn.Call("Service.TaskFinished", taskID, nil)
}
// TaskFailed tell the master server as task is failed.
func (c *Client) taskFailed(meta TaskMeta) error {
return c.conn.Call("Service.TaskFailed", meta, nil)
}
// NextRecord returns next record in the dataset.
//
// NextRecord will block until the next record is available. It is

@ -95,10 +95,16 @@ func TestGetFinishTask(t *testing.T) {
t.Fatalf("Should get error, pass: %d\n", i)
}
err = c.taskFinished(tasks[0].ID)
err = c.taskFinished(tasks[0].Meta.ID)
if err != nil {
t.Fatalf("Error: %v, pass: %d\n", err, i)
}
err = c.taskFailed(tasks[0].Meta)
if err != nil {
t.Fatalf("Error: %v, pass: %d\n", err, i)
}
tasks = tasks[1:]
task, err := c.getTask()
if err != nil {
@ -107,7 +113,7 @@ func TestGetFinishTask(t *testing.T) {
tasks = append(tasks, task)
for _, task := range tasks {
err = c.taskFinished(task.ID)
err = c.taskFinished(task.Meta.ID)
if err != nil {
t.Fatalf("Error: %v, pass: %d\n", err, i)
}

@ -31,30 +31,36 @@ type Chunk struct {
Index recordio.Index // chunk index
}
// TaskMeta is a struct which stores task's meta info.
type TaskMeta struct {
ID int
Epoch int
}
// Task is the basic unit of data instances assigned to trainers.
type Task struct {
ID int
Meta TaskMeta
Chunks []Chunk
}
type taskEntry struct {
Epoch int
NumTimeout int
Task Task
Task Task
// A task fails if it's timeout or trainer reports it exits unnormally.
NumFailure int
}
type taskQueues struct {
Todo []taskEntry
Pending map[int]taskEntry // map from task ID to task entry
Done []taskEntry
Failed []Task
Failed []taskEntry
}
// Service is the master server service.
type Service struct {
chunksPerTask int
timeoutDur time.Duration
timeoutMax int
failureMax int
ready chan struct{}
store Store
@ -73,7 +79,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
var cur taskEntry
for i, c := range chunks {
if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
cur.Task.ID = id
cur.Task.Meta.ID = id
id++
result = append(result, cur)
cur.Task.Chunks = nil
@ -83,7 +89,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
}
if len(cur.Task.Chunks) > 0 {
cur.Task.ID = id
cur.Task.Meta.ID = id
result = append(result, cur)
}
@ -91,11 +97,11 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
}
// NewService creates a new service.
func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) {
func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) {
s := &Service{}
s.chunksPerTask = chunksPerTask
s.timeoutDur = timeoutDur
s.timeoutMax = timeoutMax
s.failureMax = failureMax
s.taskQueues = taskQueues{}
s.taskQueues.Pending = make(map[int]taskEntry)
s.ready = make(chan struct{})
@ -257,6 +263,34 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
return nil
}
func (s *Service) processFailedTask(t taskEntry, epoch int) {
if t.Task.Meta.Epoch != epoch {
// new epoch, task launched after the
// schedule of this timeout check or failed status report.
return
}
defer func() {
err := s.snapshot()
if err != nil {
log.Errorln(err)
}
}()
delete(s.taskQueues.Pending, t.Task.Meta.ID)
t.NumFailure++
if t.NumFailure > s.failureMax {
log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
s.taskQueues.Failed = append(s.taskQueues.Failed, t)
return
}
log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
s.taskQueues.Todo = append(s.taskQueues.Todo, t)
return
}
func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
return func() {
s.mu.Lock()
@ -267,30 +301,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
return
}
if t.Epoch != epoch {
// new epoch, task launched after the
// schedule of this timeout check.
return
}
defer func() {
err := s.snapshot()
if err != nil {
log.Errorln(err)
}
}()
delete(s.taskQueues.Pending, t.Task.ID)
t.NumTimeout++
if t.NumTimeout > s.timeoutMax {
log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
return
}
log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
s.taskQueues.Todo = append(s.taskQueues.Todo, t)
s.processFailedTask(t, epoch)
}
}
@ -339,18 +350,18 @@ func (s *Service) GetTask(dummy int, task *Task) error {
}
t := s.taskQueues.Todo[0]
t.Epoch++
t.Task.Meta.Epoch++
s.taskQueues.Todo = s.taskQueues.Todo[1:]
s.taskQueues.Pending[t.Task.ID] = t
s.taskQueues.Pending[t.Task.Meta.ID] = t
err := s.snapshot()
if err != nil {
return err
}
*task = t.Task
log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Task.Meta)
time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
return nil
}
@ -365,13 +376,12 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
t, ok := s.taskQueues.Pending[taskID]
if !ok {
err := errors.New("pending task not found")
log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
return err
return nil
}
// task finished, reset timeout
t.NumTimeout = 0
t.NumFailure = 0
s.taskQueues.Done = append(s.taskQueues.Done, t)
delete(s.taskQueues.Pending, taskID)
@ -389,3 +399,22 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
}
return err
}
// TaskFailed tells the service that a task is failed.
func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
select {
case <-s.ready:
}
s.mu.Lock()
defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[meta.ID]
if !ok {
log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta)
return nil
}
s.processFailedTask(t, meta.Epoch)
return nil
}

@ -30,7 +30,7 @@ func TestPartionIndex(t *testing.T) {
cs := make([]Chunk, 100)
ts := partition(cs, 20)
for i := range ts {
if ts[i].Task.ID != i {
if ts[i].Task.Meta.ID != i {
t.Error(ts[i], i)
}
}

@ -0,0 +1,3 @@
if(WITH_TESTING)
go_test(pserver_test DEPS paddle_go_optimizer)
endif()

@ -0,0 +1,3 @@
if(WITH_TESTING)
go_test(pserver_client_test DEPS paddle_go_optimizer)
endif()

@ -0,0 +1 @@
libpaddle_go_optimizer.a

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save