Paddle/go/pserver/service.go

package pserver

import (
	"bufio"
	"bytes"
	"crypto/md5"
	"encoding/gob"
	"encoding/hex"
	"errors"
	"fmt"
	"os"
	"strconv"
	"sync"
	"time"

	log "github.com/sirupsen/logrus"
)

// ElementType is the type of elements of a Parameter.
type ElementType int

const (
	AlreadyInitialized = "pserver already initialized"
	Uninitialized      = "pserver not fully initialized"
)

const (
	checkpoint_path = "/checkpoints/"
)

// Supported element types
const (
	Int32 ElementType = iota
	UInt32
	Int64
	UInt64
	Float32
	Float64
)

// PsDesired is etcd path for store desired pserver count
const PsDesired = "/ps_desired"

// Parameter is a piece of data to sync with the parameter server.
type Parameter struct {
	Name        string
	ElementType ElementType
	Content     []byte
}

// ParameterWithConfig contains the parameter and the configuration.
type ParameterWithConfig struct {
	Param  Parameter
	Config []byte // parameter configuration in Proto Buffer format
	State  []byte // parameter training state
}

// Gradient is the gradient of the parameter.
type Gradient Parameter

// Service is the RPC service for pserver.
type Service struct {
	initialized chan struct{}
	idx         int

	mu     sync.Mutex
	optMap map[string]*optimizer
}

type Checkpoint struct {
	uuid      string
	md5sum    string
	timestamp string
}

//serialize ParameterWithConfig to byte stream
func GetBytes(content ...interface{}) ([]byte, error) {

	var buf bytes.Buffer
	encoder := gob.NewEncoder(&buf)
	err := encoder.Encode(content)
	if err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}

// NewService creates a new service, will bypass etcd registration if no
// endpoints specified.
func NewService(idx int) (*Service, error) {
	s := &Service{
		idx: idx,
	}
	s.optMap = make(map[string]*optimizer)
	s.initialized = make(chan struct{})
	return s, nil
}

// InitParam initializes a parameter.
func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
	select {
	case <-s.initialized:
		return errors.New(AlreadyInitialized)
	default:
	}

	// TODO(helin): parse parameter config

	s.mu.Lock()
	defer s.mu.Unlock()

	// TODO(helin): check if paramWithConfigs.Param.Content is
	// properly memory aligned, if not, make copy to a memory
	// aligned region.
	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs)
	return nil
}

// FinishInitParams tells the parameter server that the parameter
// initialization has finished.
func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
	select {
	case <-s.initialized:
		return errors.New(AlreadyInitialized)
	default:
	}

	close(s.initialized)
	return nil
}

// SendGrad sends gradient to parameter servers for parameter
// optimization.
func (s *Service) SendGrad(g Gradient, dummy *int) error {
	select {
	case <-s.initialized:
	default:
		return errors.New(Uninitialized)
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	o, ok := s.optMap[g.Name]
	if !ok {
		return fmt.Errorf("parameter: %s does not exist", g.Name)
	}

	return o.UpdateParameter(g)
}

// GetParam gets parameters from the parameter server.
func (s *Service) GetParam(name string, parameter *Parameter) error {
	<-s.initialized
	s.mu.Lock()
	defer s.mu.Unlock()

	opt, ok := s.optMap[name]
	if !ok {
		return fmt.Errorf("parameter: %s does not exist", name)
	}

	// The parameter content (a byte slice) may change
	// during RPC serialization due to write from other
	// goroutine, we allow it since mini-batch based deep
	// learning optimization methods are stochastic in
	// nature. This race condition is allowed deliberately
	// to save the program from making a copy of the
	// paramter content.
	parameter.Name = name
	parameter.ElementType = opt.elementType
	parameter.Content = opt.GetWeights()
	return nil
}

// Save tells the parameter server to save parameters.
func (s *Service) Save(path string, dummy *int) error {
	//FIXME: checkpoint is only used by pserver
	// and has a constant path of */checkpoints/{pserver_idx}*
	<-s.initialized
	s.mu.Lock()
	defer s.mu.Unlock()
	var paramWithConfig ParameterWithConfig
	for name, opt := range s.optMap {
		paramWithConfig.Param.Name = name
		paramWithConfig.Param.ElementType = opt.elementType
		paramWithConfig.Param.Content = opt.GetWeights()
		paramWithConfig.State = opt.GetStates()
		content, err := GetBytes(paramWithConfig)
		if err != nil {
			log.Errorln(err)
		}
		ck := Checkpoint{}
		h := md5.New()
		ck.md5sum = hex.EncodeToString(h.Sum(content))
		ck.timestamp = time.Now().String()
		ck.uuid = checkpoint_path + strconv.Itoa(s.idx)
		ckbytes, err := GetBytes(ck)
		if err != nil {
			log.Errorln(err)
		}
		// TODO: according design doc, need to save uuid to etcd in json format
		// {\"uuid\": [UUID], \"md5\", \"MD5 sum\", \"timestamp\": xxxx}
		log.Infof("parameter checkpoint %s", ckbytes)

		if _, err = os.Stat(ck.uuid); os.IsNotExist(err) {
			log.Info("checkpoint not exists.")
		} else {
			err = os.Remove(ck.uuid)
			log.Infof("remove %s", ck.uuid)
		}
		f, err := os.Create(ck.uuid)
		defer f.Close()
		if err != nil {
			log.Errorln(err)
		}
		writer := bufio.NewWriter(f)
		_, err = writer.Write(content)
		if err != nil {
			log.Errorln(err)
		}
	}
	return nil
}