# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fleet Utils."""

import collections
import json
import logging
import math
import numpy as np
import os
import sys
import time
import paddle.fluid as fluid
from paddle.fluid.log_helper import get_logger
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet as fleet_pslib
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as fleet_transpiler
from . import hdfs
from .hdfs import *
from . import utils

__all__ = ["FleetUtil"]

_logger = get_logger(
    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')

fleet = fleet_pslib

class FleetUtil(object):
    """
    FleetUtil provides some common functions for users' convenience.

    Examples:
        .. code-block:: python

          from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
          fleet_util = FleetUtil()
          fleet_util.rank0_print("my log")

    """
|
|
|
|
def __init__(self, mode="pslib"):
|
|
global fleet
|
|
if mode == "pslib":
|
|
fleet = fleet_pslib
|
|
elif mode == "transpiler":
|
|
fleet = fleet_transpiler
|
|
else:
|
|
raise ValueError(
|
|
"Please choose one mode from [\"pslib\", \"transpiler\"]")
|
|
|
|
def rank0_print(self, s):
|
|
"""
|
|
Worker of rank 0 prints some log.
|
|
|
|
Args:
|
|
s(str): string to print
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.rank0_print("my log")
|
|
|
|
"""
|
|
if fleet.worker_index() != 0:
|
|
return
|
|
print(s)
|
|
sys.stdout.flush()
|
|
|
|
def rank0_info(self, s):
|
|
"""
|
|
Worker of rank 0 prints some log info.
|
|
|
|
Args:
|
|
s(str): string to log
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.rank0_info("my log info")
|
|
|
|
"""
|
|
if fleet.worker_index() != 0:
|
|
return
|
|
_logger.info(s)
|
|
|
|
def rank0_error(self, s):
|
|
"""
|
|
Worker of rank 0 prints some log error.
|
|
|
|
Args:
|
|
s(str): string to log
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.rank0_error("my log error")
|
|
|
|
"""
|
|
if fleet.worker_index() != 0:
|
|
return
|
|
_logger.error(s)
|
|
|
|
def set_zero(self,
|
|
var_name,
|
|
scope=fluid.global_scope(),
|
|
place=fluid.CPUPlace(),
|
|
param_type="int64"):
|
|
"""
|
|
Set tensor of a Variable to zero.
|
|
|
|
Args:
|
|
var_name(str): name of Variable
|
|
scope(Scope): Scope object, default is fluid.global_scope()
|
|
place(Place): Place object, default is fluid.CPUPlace()
|
|
param_type(str): param data type, default is int64
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.set_zero(myvar.name, myscope)
|
|
|
|
"""
|
|
param = scope.var(var_name).get_tensor()
|
|
param_array = np.zeros(param._get_dims()).astype(param_type)
|
|
param.set(param_array, place)
|
|
|
|
def print_global_auc(self,
|
|
scope=fluid.global_scope(),
|
|
stat_pos="_generated_var_2",
|
|
stat_neg="_generated_var_3",
|
|
print_prefix=""):
|
|
"""
|
|
Print global auc of all distributed workers.
|
|
|
|
Args:
|
|
scope(Scope): Scope object, default is fluid.global_scope()
|
|
stat_pos(str): name of auc pos bucket Variable
|
|
stat_neg(str): name of auc neg bucket Variable
|
|
print_prefix(str): prefix of print auc
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.print_global_auc(myscope, stat_pos=stat_pos.name,
|
|
stat_neg=stat_neg.name)
|
|
|
|
# below is part of model
|
|
emb = my_slot_net(slots, label) # emb can be fc layer of size 1
|
|
similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
|
|
emb, min=-15.0, max=15.0), name="similarity_norm")\
|
|
binary_predict = fluid.layers.concat(input=[\
|
|
fluid.layers.elementwise_sub(\
|
|
fluid.layers.ceil(similarity_norm), similarity_norm),\
|
|
similarity_norm], axis=1)
|
|
auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, \
|
|
stat_neg] = fluid.layers.auc(input=binary_predict,\
|
|
label=label, curve='ROC',\
|
|
num_thresholds=4096)
|
|
|
|
"""
|
|
auc_value = self.get_global_auc(scope, stat_pos, stat_neg)
|
|
self.rank0_print(print_prefix + " global auc = %s" % auc_value)
|
|
|
|
def get_global_auc(self,
|
|
scope=fluid.global_scope(),
|
|
stat_pos="_generated_var_2",
|
|
stat_neg="_generated_var_3"):
|
|
"""
|
|
Get global auc of all distributed workers.
|
|
|
|
Args:
|
|
scope(Scope): Scope object, default is fluid.global_scope()
|
|
stat_pos(str): name of auc pos bucket Variable
|
|
stat_neg(str): name of auc neg bucket Variable
|
|
|
|
Returns:
|
|
auc_value(float): global auc of all distributed workers
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
auc_value = fleet_util.get_global_auc(myscope,
                                      stat_pos=stat_pos.name,
                                      stat_neg=stat_neg.name)
|
|
|
|
"""
|
|
if scope.find_var(stat_pos) is None or scope.find_var(stat_neg) is None:
|
|
self.rank0_print("not found auc bucket")
|
|
return None
|
|
fleet._role_maker._barrier_worker()
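# each worker holds local pos/neg instance counts per predicted-score
# bucket (the auc op's stat_pos/stat_neg states); they are summed across
# all workers via _all_reduce below, so the auc computed here is global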
|
|
# auc pos bucket
|
|
pos = np.array(scope.find_var(stat_pos).get_tensor())
|
|
# auc pos bucket shape
|
|
old_pos_shape = np.array(pos.shape)
|
|
# reshape to one dim
|
|
pos = pos.reshape(-1)
|
|
global_pos = np.copy(pos) * 0
|
|
# mpi allreduce
|
|
fleet._role_maker._all_reduce(pos, global_pos)
|
|
# reshape to its original shape
|
|
global_pos = global_pos.reshape(old_pos_shape)
|
|
|
|
# auc neg bucket
|
|
neg = np.array(scope.find_var(stat_neg).get_tensor())
|
|
old_neg_shape = np.array(neg.shape)
|
|
neg = neg.reshape(-1)
|
|
global_neg = np.copy(neg) * 0
|
|
fleet._role_maker._all_reduce(neg, global_neg)
|
|
global_neg = global_neg.reshape(old_neg_shape)
|
|
|
|
# calculate auc
|
|
num_bucket = len(global_pos[0])
|
|
area = 0.0
|
|
pos = 0.0
|
|
neg = 0.0
|
|
new_pos = 0.0
|
|
new_neg = 0.0
|
|
total_ins_num = 0
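# walk buckets from the highest predicted score to the lowest and
# accumulate the ROC area with the trapezoid rule: (new_neg - neg) is the
# step along the false-positive axis, (pos + new_pos) / 2 the average
# height along the true-positive axis; dividing by pos * neg at the end
# normalizes both axes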
|
|
for i in range(num_bucket):
|
|
index = num_bucket - 1 - i
|
|
new_pos = pos + global_pos[0][index]
|
|
total_ins_num += global_pos[0][index]
|
|
new_neg = neg + global_neg[0][index]
|
|
total_ins_num += global_neg[0][index]
|
|
area += (new_neg - neg) * (pos + new_pos) / 2
|
|
pos = new_pos
|
|
neg = new_neg
|
|
|
|
auc_value = None
|
|
if pos * neg == 0 or total_ins_num == 0:
|
|
auc_value = 0.5
|
|
else:
|
|
auc_value = area / (pos * neg)
|
|
|
|
fleet._role_maker._barrier_worker()
|
|
return auc_value
|
|
|
|
def load_fleet_model_one_table(self, table_id, path):
|
|
"""
|
|
load pslib model to one table
|
|
|
|
Args:
|
|
table_id(int): table id to load the model into
|
|
path(str): model path
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.load_fleet_model_one_table(1, "hdfs:/my/model/path")
|
|
"""
|
|
fleet.load_one_table(table_id, path)
|
|
|
|
def load_fleet_model(self, path, mode=0):
|
|
"""
|
|
load pslib model
|
|
|
|
Args:
|
|
path(str): model path
|
|
mode(int): 0 or 1, 0 means load checkpoint model, 1 means load delta model,
           default is 0
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
|
|
fleet_util.load_fleet_model("hdfs:/my/model/path")
|
|
|
|
fleet_util.load_fleet_model("hdfs:/my/model/path", mode=0)
|
|
|
|
"""
|
|
fleet.init_server(path, mode=mode)
|
|
|
|
def save_fleet_model(self, path, mode=0):
|
|
"""
|
|
save pslib model
|
|
|
|
Args:
|
|
path(str): model path
|
|
mode(int): 0 or 1, 0 means save checkpoint model, 1 means save delta model,
           default is 0
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.save_fleet_model("hdfs:/my/model/path")
|
|
|
|
"""
|
|
fleet.save_persistables(None, path, mode=mode)
|
|
|
|
def _get_xbox_str(self,
|
|
output_path,
|
|
day,
|
|
model_path,
|
|
xbox_base_key,
|
|
data_path,
|
|
hadoop_fs_name,
|
|
monitor_data={},
|
|
mode="patch"):
|
|
xbox_dict = collections.OrderedDict()
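# the returned string is one JSON line appended to the xbox donefile;
# a record looks roughly like this (illustrative values only):
# {"id": "1563872400", "key": "1563872400",
#  "input": "hdfs://xxx/my/output/20190722/delta-1/000",
#  "record_count": "111111", "partition_type": "2", ...,
#  "monitor_data": "", "mpi_size": "100"}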
|
|
if mode == "base":
|
|
xbox_dict["id"] = str(xbox_base_key)
|
|
elif mode == "patch":
|
|
xbox_dict["id"] = str(int(time.time()))
|
|
else:
|
|
print("warning: unknown mode %s, set it to patch" % mode)
|
|
mode = "patch"
|
|
xbox_dict["id"] = str(int(time.time()))
|
|
xbox_dict["key"] = str(xbox_base_key)
|
|
if model_path.startswith("hdfs:") or model_path.startswith("afs:"):
|
|
model_path = model_path[model_path.find(":") + 1:]
|
|
xbox_dict["input"] = hadoop_fs_name + model_path.rstrip("/") + "/000"
|
|
xbox_dict["record_count"] = "111111"
|
|
xbox_dict["partition_type"] = "2"
|
|
xbox_dict["job_name"] = "default_job_name"
|
|
xbox_dict["ins_tag"] = "feasign"
|
|
xbox_dict["ins_path"] = data_path
|
|
job_id_with_host = os.popen("echo -n ${JOB_ID}").read().strip()
|
|
instance_id = os.popen("echo -n ${INSTANCE_ID}").read().strip()
|
|
start_pos = instance_id.find(job_id_with_host)
|
|
end_pos = instance_id.find("--")
|
|
if start_pos != -1 and end_pos != -1:
|
|
job_id_with_host = instance_id[start_pos:end_pos]
|
|
xbox_dict["job_id"] = job_id_with_host
|
|
# currently hard coded here: monitor_data is set to an empty string
|
|
xbox_dict["monitor_data"] = ""
|
|
xbox_dict["monitor_path"] = output_path.rstrip("/") + "/monitor/" \
|
|
+ day + ".txt"
|
|
xbox_dict["mpi_size"] = str(fleet.worker_num())
|
|
return json.dumps(xbox_dict)
|
|
|
|
def write_model_donefile(self,
|
|
output_path,
|
|
day,
|
|
pass_id,
|
|
xbox_base_key,
|
|
hadoop_fs_name,
|
|
hadoop_fs_ugi,
|
|
hadoop_home="$HADOOP_HOME",
|
|
donefile_name="donefile.txt"):
|
|
"""
|
|
write donefile when saving the model
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day
|
|
pass_id(str|int): training pass id
|
|
xbox_base_key(str|int): xbox base key
|
|
hadoop_fs_name(str): hdfs/afs fs name
|
|
hadoop_fs_ugi(str): hdfs/afs fs ugi
|
|
hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
|
|
donefile_name(str): donefile name, default is "donefile.txt"
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.write_model_donefile(output_path="hdfs:/my/output",
                                day=20190723,
                                pass_id=66,
                                xbox_base_key=int(time.time()),
                                hadoop_fs_name="hdfs://xxx",
                                hadoop_fs_ugi="user,passwd")
|
|
|
|
"""
|
|
day = str(day)
|
|
pass_id = str(pass_id)
|
|
xbox_base_key = int(xbox_base_key)
|
|
|
|
if pass_id != "-1":
|
|
suffix_name = "/%s/%s/" % (day, pass_id)
|
|
model_path = output_path.rstrip("/") + suffix_name
|
|
else:
|
|
suffix_name = "/%s/0/" % day
|
|
model_path = output_path.rstrip("/") + suffix_name
|
|
|
|
if fleet.worker_index() == 0:
|
|
donefile_path = output_path + "/" + donefile_name
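# each donefile record is one tab-separated line:
# <day>\t<xbox_base_key>\t<model_path>\t<pass_id>\t0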
|
|
content = "%s\t%lu\t%s\t%s\t%d" % (day, xbox_base_key,\
|
|
model_path, pass_id, 0)
|
|
configs = {
|
|
"fs.default.name": hadoop_fs_name,
|
|
"hadoop.job.ugi": hadoop_fs_ugi
|
|
}
|
|
client = HDFSClient(hadoop_home, configs)
|
|
if client.is_file(donefile_path):
|
|
pre_content = client.cat(donefile_path)
|
|
pre_content_list = pre_content.split("\n")
|
|
day_list = [i.split("\t")[0] for i in pre_content_list]
|
|
pass_list = [i.split("\t")[3] for i in pre_content_list]
|
|
exist = False
|
|
for i in range(len(day_list)):
|
|
if int(day) == int(day_list[i]) and \
|
|
int(pass_id) == int(pass_list[i]):
|
|
exist = True
|
|
break
|
|
if not exist:
|
|
with open(donefile_name, "w") as f:
|
|
f.write(pre_content + "\n")
|
|
f.write(content + "\n")
|
|
client.delete(donefile_path)
|
|
client.upload(
|
|
output_path,
|
|
donefile_name,
|
|
multi_processes=1,
|
|
overwrite=False)
|
|
self.rank0_error("write %s/%s %s succeed" % \
|
|
(day, pass_id, donefile_name))
|
|
else:
|
|
self.rank0_error("not write %s because %s/%s already "
|
|
"exists" % (donefile_name, day, pass_id))
|
|
else:
|
|
with open(donefile_name, "w") as f:
|
|
f.write(content + "\n")
|
|
client.upload(
|
|
output_path,
|
|
donefile_name,
|
|
multi_processes=1,
|
|
overwrite=False)
|
|
self.rank0_error("write %s/%s %s succeed" % \
|
|
(day, pass_id, donefile_name))
|
|
fleet._role_maker._barrier_worker()
|
|
|
|
def write_xbox_donefile(self,
|
|
output_path,
|
|
day,
|
|
pass_id,
|
|
xbox_base_key,
|
|
data_path,
|
|
hadoop_fs_name,
|
|
hadoop_fs_ugi,
|
|
monitor_data={},
|
|
hadoop_home="$HADOOP_HOME",
|
|
donefile_name=None):
|
|
"""
|
|
write delta donefile or xbox base donefile
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day of model
|
|
pass_id(str|int): training pass id of model
|
|
xbox_base_key(str|int): xbox base key
|
|
data_path(str|list): training data path
|
|
hadoop_fs_name(str): hdfs/afs fs name
|
|
hadoop_fs_ugi(str): hdfs/afs fs ugi
|
|
monitor_data(dict): metrics
|
|
hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
|
|
donefile_name(str): donefile name, default is None
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.write_xbox_donefile(
    output_path="hdfs:/my/output/",
    day=20190722,
    pass_id=1,
    xbox_base_key=int(time.time()),
    data_path="hdfs:/my/data/",
    hadoop_fs_name="hdfs://xxx",
    hadoop_fs_ugi="user,passwd",
    monitor_data={})
|
|
|
|
"""
|
|
day = str(day)
|
|
pass_id = str(pass_id)
|
|
xbox_base_key = int(xbox_base_key)
|
|
mode = None
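# pass_id == -1 means the model was saved as an xbox base model
# (output_path/<day>/base/), otherwise as a delta model
# (output_path/<day>/delta-<pass_id>/); the donefile name and the mode
# passed to _get_xbox_str are chosen accordingly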
|
|
|
|
if pass_id != "-1":
|
|
mode = "patch"
|
|
suffix_name = "/%s/delta-%s/" % (day, pass_id)
|
|
model_path = output_path.rstrip("/") + suffix_name
|
|
if donefile_name is None:
|
|
donefile_name = "xbox_patch_done.txt"
|
|
else:
|
|
mode = "base"
|
|
suffix_name = "/%s/base/" % day
|
|
model_path = output_path.rstrip("/") + suffix_name
|
|
if donefile_name is None:
|
|
donefile_name = "xbox_base_done.txt"
|
|
|
|
if isinstance(data_path, list):
|
|
data_path = ",".join(data_path)
|
|
|
|
if fleet.worker_index() == 0:
|
|
donefile_path = output_path + "/" + donefile_name
|
|
xbox_str = self._get_xbox_str(output_path, day, model_path, \
|
|
xbox_base_key, data_path, hadoop_fs_name, monitor_data={},
|
|
mode=mode)
|
|
configs = {
|
|
"fs.default.name": hadoop_fs_name,
|
|
"hadoop.job.ugi": hadoop_fs_ugi
|
|
}
|
|
client = HDFSClient(hadoop_home, configs)
|
|
if client.is_file(donefile_path):
|
|
pre_content = client.cat(donefile_path)
|
|
last_dict = json.loads(pre_content.split("\n")[-1])
|
|
last_day = last_dict["input"].split("/")[-3]
|
|
last_pass = last_dict["input"].split("/")[-2].split("-")[-1]
|
|
exist = False
|
|
if int(day) < int(last_day) or \
|
|
int(day) == int(last_day) and \
|
|
int(pass_id) <= int(last_pass):
|
|
exist = True
|
|
if not exist:
|
|
with open(donefile_name, "w") as f:
|
|
f.write(pre_content + "\n")
|
|
f.write(xbox_str + "\n")
|
|
client.delete(donefile_path)
|
|
client.upload(
|
|
output_path,
|
|
donefile_name,
|
|
multi_processes=1,
|
|
overwrite=False)
|
|
self.rank0_error("write %s/%s %s succeed" % \
|
|
(day, pass_id, donefile_name))
|
|
else:
|
|
self.rank0_error("not write %s because %s/%s already "
|
|
"exists" % (donefile_name, day, pass_id))
|
|
else:
|
|
with open(donefile_name, "w") as f:
|
|
f.write(xbox_str + "\n")
|
|
client.upload(
|
|
output_path,
|
|
donefile_name,
|
|
multi_processes=1,
|
|
overwrite=False)
|
|
self.rank0_error("write %s/%s %s succeed" % \
|
|
(day, pass_id, donefile_name))
|
|
fleet._role_maker._barrier_worker()
|
|
|
|
def write_cache_donefile(self,
|
|
output_path,
|
|
day,
|
|
pass_id,
|
|
key_num,
|
|
hadoop_fs_name,
|
|
hadoop_fs_ugi,
|
|
hadoop_home="$HADOOP_HOME",
|
|
donefile_name="sparse_cache.meta",
|
|
**kwargs):
|
|
"""
|
|
write cache donefile
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day of model
|
|
pass_id(str|int): training pass id of model
|
|
key_num(str|int): save cache return value
|
|
hadoop_fs_name(str): hdfs/afs fs name
|
|
hadoop_fs_ugi(str): hdfs/afs fs ugi
|
|
hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
|
|
donefile_name(str): donefile name, default is "sparse_cache.meta"
|
|
kwargs(dict): user defined properties
|
|
file_num(int): cache file num
|
|
table_id(int): cache table id
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.write_cache_donefile(
|
|
output_path="hdfs:/my/output/",
|
|
day=20190722,
|
|
pass_id=1,
|
|
key_num=123456,
|
|
hadoop_fs_name="hdfs://xxx",
|
|
hadoop_fs_ugi="user,passwd",
|
|
)
|
|
|
|
"""
|
|
day = str(day)
|
|
pass_id = str(pass_id)
|
|
key_num = int(key_num)
|
|
file_num = kwargs.get("file_num", 16)
|
|
table_id = kwargs.get("table_id", 0)
|
|
|
|
if pass_id != "-1":
|
|
suffix_name = "/%s/delta-%s/%03d_cache" % (day, pass_id, table_id)
|
|
model_path = output_path.rstrip("/") + suffix_name
|
|
else:
|
|
suffix_name = "/%s/base/%03d_cache" % (day, table_id)
|
|
model_path = output_path.rstrip("/") + suffix_name
|
|
|
|
if fleet.worker_index() == 0:
|
|
donefile_path = model_path + "/" + donefile_name
|
|
configs = {
|
|
"fs.default.name": hadoop_fs_name,
|
|
"hadoop.job.ugi": hadoop_fs_ugi
|
|
}
|
|
client = HDFSClient(hadoop_home, configs)
|
|
if client.is_file(donefile_path):
|
|
self.rank0_error( \
|
|
"not write because %s already exists" % donefile_path)
|
|
else:
|
|
meta_str = "file_prefix:part\npart_num:%s\nkey_num:%d\n" \
|
|
% (file_num, key_num)
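# the resulting sparse_cache.meta looks like (illustrative values):
#   file_prefix:part
#   part_num:16
#   key_num:123456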
|
|
with open(donefile_name, "w") as f:
|
|
f.write(meta_str)
|
|
client.upload(
|
|
model_path,
|
|
donefile_name,
|
|
multi_processes=1,
|
|
overwrite=False)
|
|
self.rank0_error("write %s succeed" % donefile_path)
|
|
fleet._role_maker._barrier_worker()
|
|
|
|
def load_model(self, output_path, day, pass_id):
|
|
"""
|
|
load pslib model
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day
|
|
pass_id(str|int): training pass id
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.load_model("hdfs:/my/path", 20190722, 88)
|
|
|
|
"""
|
|
day = str(day)
|
|
pass_id = str(pass_id)
|
|
suffix_name = "/%s/%s/" % (day, pass_id)
|
|
load_path = output_path + suffix_name
|
|
self.rank0_error("going to load_model %s" % load_path)
|
|
self.load_fleet_model(load_path)
|
|
self.rank0_error("load_model done")
|
|
|
|
def save_model(self, output_path, day, pass_id):
|
|
"""
|
|
save pslib model
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day
|
|
pass_id(str|int): training pass id
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.save_model("hdfs:/my/path", 20190722, 88)
|
|
|
|
"""
|
|
day = str(day)
|
|
pass_id = str(pass_id)
|
|
suffix_name = "/%s/%s/" % (day, pass_id)
|
|
model_path = output_path + suffix_name
|
|
self.rank0_print("going to save_model %s" % model_path)
|
|
self.save_fleet_model(model_path)
|
|
self.rank0_print("save_model done")
|
|
|
|
def save_batch_model(self, output_path, day):
|
|
"""
|
|
save batch model
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.save_batch_model("hdfs:/my/path", 20190722)
|
|
|
|
"""
|
|
day = str(day)
|
|
suffix_name = "/%s/0/" % day
|
|
model_path = output_path + suffix_name
|
|
self.rank0_print("going to save_model %s" % model_path)
|
|
fleet.save_persistables(None, model_path, mode=3)
|
|
self.rank0_print("save_batch_model done")
|
|
|
|
def save_delta_model(self, output_path, day, pass_id):
|
|
"""
|
|
save delta model
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day
|
|
pass_id(str|int): training pass id
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.save_delta_model("hdfs:/my/path", 20190722, 88)
|
|
|
|
"""
|
|
day = str(day)
|
|
pass_id = str(pass_id)
|
|
suffix_name = "/%s/delta-%s/" % (day, pass_id)
|
|
model_path = output_path + suffix_name
|
|
self.rank0_print("going to save_delta_model %s" % model_path)
|
|
fleet.save_persistables(None, model_path, mode=1)
|
|
self.rank0_print("save_delta_model done")
|
|
|
|
def save_xbox_base_model(self, output_path, day):
|
|
"""
|
|
save xbox base model
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.save_xbox_base_model("hdfs:/my/path", 20190722)
|
|
|
|
"""
|
|
day = str(day)
|
|
suffix_name = "/%s/base/" % day
|
|
model_path = output_path + suffix_name
|
|
self.rank0_print("going to save_xbox_base_model " + model_path)
|
|
fleet.save_persistables(None, model_path, mode=2)
|
|
self.rank0_print("save_xbox_base_model done")
|
|
|
|
def save_cache_model(self, output_path, day, pass_id, mode=1, **kwargs):
|
|
"""
|
|
save cache model
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day
|
|
pass_id(str|int): training pass id
|
|
mode(str|int): save mode
|
|
kwargs(dict): user defined properties
|
|
table_id(int): table id to save cache
|
|
|
|
Returns:
|
|
key_num(int): cache key num
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.save_cache_model("hdfs:/my/path", 20190722, 88)
|
|
|
|
"""
|
|
day = str(day)
|
|
pass_id = str(pass_id)
|
|
mode = int(mode)
|
|
table_id = kwargs.get("table_id", 0)
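# mode and table_id are forwarded to fleet.save_cache_model; in this class
# mode=1 is used for delta passes and mode=2 for base models (see
# save_cache_base_model); the return value is the number of cached keys,
# which write_cache_donefile later records as key_num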
|
|
suffix_name = "/%s/delta-%s" % (day, pass_id)
|
|
model_path = output_path.rstrip("/") + suffix_name
|
|
self.rank0_print("going to save_cache_model %s" % model_path)
|
|
key_num = fleet.save_cache_model(
|
|
None, model_path, mode=mode, table_id=table_id)
|
|
self.rank0_print("save_cache_model done")
|
|
return key_num
|
|
|
|
def save_cache_base_model(self, output_path, day, **kwargs):
|
|
"""
|
|
save cache base model
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
day(str|int): training day
|
|
|
|
kwargs(dict): user defined properties
|
|
table_id(int): table id to save cache
|
|
|
|
Returns:
|
|
key_num(int): cache key num
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.save_cache_base_model("hdfs:/my/path", 20190722)
|
|
|
|
"""
|
|
day = str(day)
|
|
table_id = kwargs.get("table_id", 0)
|
|
suffix_name = "/%s/base" % day
|
|
model_path = output_path.rstrip("/") + suffix_name
|
|
self.rank0_print("going to save_cache_base_model %s" % model_path)
|
|
key_num = fleet.save_cache_model(
|
|
None, model_path, mode=2, table_id=table_id)
|
|
self.rank0_print("save_cache_base_model done")
|
|
return key_num
|
|
|
|
def pull_all_dense_params(self, scope, program):
|
|
"""
|
|
pull all dense params in trainer of rank 0
|
|
|
|
Args:
|
|
scope(Scope): fluid Scope
|
|
program(Program): fluid Program
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.pull_all_dense_params(my_scope, my_program)
|
|
|
|
"""
|
|
fleet._role_maker._barrier_worker()
|
|
if fleet._role_maker.is_first_worker():
|
|
prog_id = str(id(program))
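# look up the dense tables registered for this Program in the fleet
# optimizer info, then pull each table's dense variables from the
# parameter servers into the given scope; only the first worker pulls,
# the barriers keep the other workers in sync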
|
|
tables = fleet._opt_info["program_id_to_worker"][prog_id].\
|
|
get_desc().dense_table
|
|
prog_conf = fleet._opt_info['program_configs'][prog_id]
|
|
prog_tables = {}
|
|
for key in prog_conf:
|
|
if "dense" not in key:
|
|
continue
|
|
for table_id in prog_conf[key]:
|
|
prog_tables[int(table_id)] = 0
|
|
for table in tables:
|
|
if int(table.table_id) not in prog_tables:
|
|
continue
|
|
var_name_list = []
|
|
for i in range(0, len(table.dense_variable_name)):
|
|
var_name = table.dense_variable_name[i]
|
|
if scope.find_var(var_name) is None:
|
|
raise ValueError("var " + var_name +
|
|
" not found in scope " +
|
|
"when pull dense")
|
|
var_name_list.append(var_name)
|
|
fleet._fleet_ptr.pull_dense(scope,
|
|
int(table.table_id), var_name_list)
|
|
fleet._role_maker._barrier_worker()
|
|
|
|
def save_paddle_inference_model(self,
|
|
executor,
|
|
scope,
|
|
program,
|
|
feeded_vars,
|
|
target_vars,
|
|
output_path,
|
|
day,
|
|
pass_id,
|
|
hadoop_fs_name,
|
|
hadoop_fs_ugi,
|
|
hadoop_home="$HADOOP_HOME",
|
|
save_combine=True):
|
|
"""
|
|
save paddle inference model, and upload to hdfs dnn_plugin path
|
|
|
|
Args:
|
|
executor(Executor): fluid Executor
|
|
scope(Scope): fluid Scope
|
|
program(Program): fluid Program
|
|
feeded_vars(list[Variable]): feed vars
|
|
target_vars(list[variable]): fetch vars
|
|
output_path(str): hdfs/afs output path
|
|
day(str|int): training day
|
|
pass_id(str|int): training pass
|
|
hadoop_fs_name(str): hadoop fs name
|
|
hadoop_fs_ugi(str): hadoop fs ugi
|
|
hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
|
|
save_combine(bool): whether to save parameters in a single combined file
                    or in separate files, default is True
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.save_paddle_inference_model(exe,
|
|
join_scope,
|
|
join_program,
|
|
feeded_vars,
|
|
target_vars,
|
|
"hdfs:/my/output/path/",
|
|
day=20190727,
|
|
pass_id=6,
|
|
hadoop_fs_name="xxx",
|
|
hadoop_fs_ugi="xxx,xxx")
|
|
"""
|
|
day = str(day)
|
|
pass_id = str(pass_id)
|
|
feeded_var_names = [i.name for i in feeded_vars]
|
|
model_name = "inference_model"
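# the inference model is saved locally under ./inference_model first and
# then uploaded to the dnn_plugin directory of the corresponding base/ or
# delta-<pass_id>/ model path on hdfs/afs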
|
|
# pull dense before save
|
|
self.pull_all_dense_params(scope, program)
|
|
if fleet.worker_index() == 0:
|
|
with fluid.scope_guard(scope):
|
|
if save_combine:
|
|
fluid.io.save_inference_model(
|
|
dirname=model_name,
|
|
feeded_var_names=feeded_var_names,
|
|
target_vars=target_vars,
|
|
executor=executor,
|
|
main_program=program.clone(),
|
|
params_filename="params")
|
|
else:
|
|
fluid.io.save_inference_model(
|
|
dirname=model_name,
|
|
feeded_var_names=feeded_var_names,
|
|
target_vars=target_vars,
|
|
executor=executor,
|
|
main_program=program.clone())
|
|
|
|
configs = {
|
|
"fs.default.name": hadoop_fs_name,
|
|
"hadoop.job.ugi": hadoop_fs_ugi
|
|
}
|
|
client = HDFSClient(hadoop_home, configs)
|
|
|
|
if pass_id == "-1":
|
|
dest = "%s/%s/base/dnn_plugin/" % (output_path, day)
|
|
else:
|
|
dest = "%s/%s/delta-%s/dnn_plugin/" % (output_path, day,
|
|
pass_id)
|
|
if not client.is_exist(dest):
|
|
client.makedirs(dest)
|
|
|
|
client.upload(dest, model_name)
|
|
|
|
fleet._role_maker._barrier_worker()
|
|
|
|
def save_paddle_params(self,
|
|
executor,
|
|
scope,
|
|
program,
|
|
model_name,
|
|
output_path,
|
|
day,
|
|
pass_id,
|
|
hadoop_fs_name,
|
|
hadoop_fs_ugi,
|
|
hadoop_home="$HADOOP_HOME",
|
|
var_names=None,
|
|
save_combine=True):
|
|
"""
|
|
save paddle model, and upload to hdfs dnn_plugin path
|
|
|
|
Args:
|
|
executor(Executor): fluid Executor
|
|
scope(Scope): fluid Scope
|
|
program(Program): fluid Program
|
|
model_name(str): save model local dir or filename
|
|
output_path(str): hdfs/afs output path
|
|
day(str|int): training day
|
|
pass_id(str|int): training pass
|
|
hadoop_fs_name(str): hadoop fs name
|
|
hadoop_fs_ugi(str): hadoop fs ugi
|
|
hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
|
|
var_names(list): save persistable var names, default is None
|
|
save_combine(bool): whether to save parameters in a single combined file
                    or in separate files, default is True
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.save_paddle_params(exe,
|
|
join_scope,
|
|
join_program,
|
|
"paddle_dense.model.0",
|
|
"hdfs:/my/output/path/",
|
|
day=20190727,
|
|
pass_id=6,
|
|
hadoop_fs_name="xxx",
|
|
hadoop_fs_ugi="xxx,xxx",
|
|
var_names=join_all_var_names)
|
|
fleet_util.save_paddle_params(exe,
|
|
join_scope,
|
|
join_program,
|
|
"paddle_dense.model.usr.0",
|
|
"hdfs:/my/output/path/",
|
|
day=20190727,
|
|
pass_id=6,
|
|
hadoop_fs_name="xxx",
|
|
hadoop_fs_ugi="xxx,xxx",
|
|
var_names=join_user_var_names)
|
|
fleet_util.save_paddle_params(exe,
|
|
join_scope,
|
|
join_program,
|
|
"paddle_dense.model.item.0",
|
|
"hdfs:/my/output/path/",
|
|
day=20190727,
|
|
pass_id=6,
|
|
hadoop_fs_name="xxx",
|
|
hadoop_fs_ugi="xxx,xxx",
|
|
var_names=join_user_item_names)
|
|
|
|
"""
|
|
day = str(day)
|
|
pass_id = str(pass_id)
|
|
# pull dense before save
|
|
self.pull_all_dense_params(scope, program)
|
|
if fleet.worker_index() == 0:
|
|
vars = [program.global_block().var(i) for i in var_names]
|
|
with fluid.scope_guard(scope):
|
|
if save_combine:
|
|
fluid.io.save_vars(
|
|
executor, "./", program, vars=vars, filename=model_name)
|
|
else:
|
|
fluid.io.save_vars(executor, model_name, program, vars=vars)
|
|
|
|
configs = {
|
|
"fs.default.name": hadoop_fs_name,
|
|
"hadoop.job.ugi": hadoop_fs_ugi
|
|
}
|
|
client = HDFSClient(hadoop_home, configs)
|
|
|
|
if pass_id == "-1":
|
|
dest = "%s/%s/base/dnn_plugin/" % (output_path, day)
|
|
else:
|
|
dest = "%s/%s/delta-%s/dnn_plugin/" % (output_path, day,
|
|
pass_id)
|
|
if not client.is_exist(dest):
|
|
client.makedirs(dest)
|
|
|
|
if os.path.isdir(model_name):
|
|
client.upload_dir(dest, model_name)
|
|
else:
|
|
client.upload(dest, model_name)
|
|
|
|
fleet._role_maker._barrier_worker()
|
|
|
|
def get_last_save_xbox_base(self,
|
|
output_path,
|
|
hadoop_fs_name,
|
|
hadoop_fs_ugi,
|
|
hadoop_home="$HADOOP_HOME"):
|
|
"""
|
|
get last saved base xbox info from xbox_base_done.txt
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
hadoop_fs_name(str): hdfs/afs fs_name
|
|
hadoop_fs_ugi(str): hdfs/afs fs_ugi
|
|
hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
|
|
|
|
Returns:
|
|
[last_save_day, last_path, xbox_base_key]
|
|
last_save_day(int): day of saved model
|
|
last_path(str): model path
|
|
xbox_base_key(int): xbox key
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
last_save_day, last_path, xbox_base_key = \
    fleet_util.get_last_save_xbox_base("hdfs:/my/path",
                                       hadoop_fs_name="hdfs://xxx",
                                       hadoop_fs_ugi="user,passwd")
|
|
|
|
"""
|
|
donefile_path = output_path + "/xbox_base_done.txt"
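# the last line of xbox_base_done.txt is a JSON record written by
# write_xbox_donefile; day and model path are parsed from its "input"
# field and the base key from its "key" field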
|
|
configs = {
|
|
"fs.default.name": hadoop_fs_name,
|
|
"hadoop.job.ugi": hadoop_fs_ugi
|
|
}
|
|
client = HDFSClient(hadoop_home, configs)
|
|
if not client.is_file(donefile_path):
|
|
return [-1, -1, int(time.time())]
|
|
pre_content = client.cat(donefile_path)
|
|
last_dict = json.loads(pre_content.split("\n")[-1])
|
|
last_day = int(last_dict["input"].split("/")[-3])
|
|
last_path = "/".join(last_dict["input"].split("/")[:-1])
|
|
xbox_base_key = int(last_dict["key"])
|
|
return [last_day, last_path, xbox_base_key]
|
|
|
|
def get_last_save_xbox(self,
|
|
output_path,
|
|
hadoop_fs_name,
|
|
hadoop_fs_ugi,
|
|
hadoop_home="$HADOOP_HOME"):
|
|
"""
|
|
get last saved xbox info from xbox_patch_done.txt
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
hadoop_fs_name(str): hdfs/afs fs_name
|
|
hadoop_fs_ugi(str): hdfs/afs fs_ugi
|
|
hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
|
|
|
|
Returns:
|
|
[last_save_day, last_save_pass, last_path, xbox_base_key]
|
|
last_save_day(int): day of saved model
|
|
last_save_pass(int): pass id of saved
|
|
last_path(str): model path
|
|
xbox_base_key(int): xbox key
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
last_save_day, last_save_pass, last_path, xbox_base_key = \
    fleet_util.get_last_save_xbox("hdfs:/my/path",
                                  hadoop_fs_name="hdfs://xxx",
                                  hadoop_fs_ugi="user,passwd")
|
|
|
|
"""
|
|
donefile_path = output_path + "/xbox_patch_done.txt"
|
|
configs = {
|
|
"fs.default.name": hadoop_fs_name,
|
|
"hadoop.job.ugi": hadoop_fs_ugi
|
|
}
|
|
client = HDFSClient(hadoop_home, configs)
|
|
if not client.is_file(donefile_path):
|
|
return [-1, -1, "", int(time.time())]
|
|
pre_content = client.cat(donefile_path)
|
|
last_dict = json.loads(pre_content.split("\n")[-1])
|
|
last_day = int(last_dict["input"].split("/")[-3])
|
|
last_pass = int(last_dict["input"].split("/")[-2].split("-")[-1])
|
|
last_path = "/".join(last_dict["input"].split("/")[:-1])
|
|
xbox_base_key = int(last_dict["key"])
|
|
return [last_day, last_pass, last_path, xbox_base_key]
|
|
|
|
def get_last_save_model(self,
|
|
output_path,
|
|
hadoop_fs_name,
|
|
hadoop_fs_ugi,
|
|
hadoop_home="$HADOOP_HOME"):
|
|
"""
|
|
get last saved model info from donefile.txt
|
|
|
|
Args:
|
|
output_path(str): output path
|
|
hadoop_fs_name(str): hdfs/afs fs_name
|
|
hadoop_fs_ugi(str): hdfs/afs fs_ugi
|
|
hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
|
|
|
|
Returns:
|
|
[last_save_day, last_save_pass, last_path, xbox_base_key]
|
|
last_save_day(int): day of saved model
|
|
last_save_pass(int): pass id of saved
|
|
last_path(str): model path
|
|
xbox_base_key(int): xbox key
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
last_save_day, last_save_pass, last_path, xbox_base_key = \
    fleet_util.get_last_save_model("hdfs:/my/path",
                                   hadoop_fs_name="hdfs://xxx",
                                   hadoop_fs_ugi="user,passwd")
|
|
|
|
"""
|
|
last_save_day = -1
|
|
last_save_pass = -1
|
|
last_path = ""
|
|
donefile_path = output_path + "/donefile.txt"
|
|
configs = {
|
|
"fs.default.name": hadoop_fs_name,
|
|
"hadoop.job.ugi": hadoop_fs_ugi
|
|
}
|
|
client = HDFSClient(hadoop_home, configs)
|
|
if not client.is_file(donefile_path):
|
|
return [-1, -1, "", int(time.time())]
|
|
content = client.cat(donefile_path)
|
|
content = content.split("\n")[-1].split("\t")
|
|
last_save_day = int(content[0])
|
|
last_save_pass = int(content[3])
|
|
last_path = content[2]
|
|
xbox_base_key = int(content[1])
|
|
return [last_save_day, last_save_pass, last_path, xbox_base_key]
|
|
|
|
def get_online_pass_interval(self, days, hours, split_interval,
|
|
split_per_pass, is_data_hourly_placed):
|
|
"""
|
|
get online pass interval
|
|
|
|
Args:
|
|
days(str): days to train
|
|
hours(str): hours to train
|
|
split_interval(int|str): split interval
|
|
split_per_pass(int|str): splits per pass
|
|
is_data_hourly_placed(bool): is data hourly placed
|
|
|
|
Returns:
|
|
online_pass_interval(list)
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
online_pass_interval = fleet_util.get_online_pass_interval(
|
|
days="{20190720..20190729}",
|
|
hours="{0..23}",
|
|
split_interval=5,
|
|
split_per_pass=2,
|
|
is_data_hourly_placed=False)
|
|
|
|
"""
|
|
days = os.popen("echo -n " + days).read().split(" ")
|
|
hours = os.popen("echo -n " + hours).read().split(" ")
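# days/hours may be brace expressions such as "{20190720..20190729}" or
# "{0..23}"; this relies on the shell spawned by os.popen to expand them
# via echo into space-separated lists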
|
|
split_interval = int(split_interval)
|
|
split_per_pass = int(split_per_pass)
|
|
splits_per_day = 24 * 60 // split_interval
pass_per_day = splits_per_day // split_per_pass
|
|
left_train_hour = int(hours[0])
|
|
right_train_hour = int(hours[-1])
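# illustrative numbers, assuming hours covers the whole day: with
# split_interval=5 and split_per_pass=2 there are 288 five-minute splits
# per day grouped into 144 passes, e.g. the first pass covers
# ["0000", "0005"]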
|
|
|
|
start = 0
|
|
split_path = []
|
|
for i in range(splits_per_day):
|
|
h = start // 60
|
|
m = start % 60
|
|
if h < left_train_hour or h > right_train_hour:
|
|
start += split_interval
|
|
continue
|
|
if is_data_hourly_placed:
|
|
split_path.append("%02d" % h)
|
|
else:
|
|
split_path.append("%02d%02d" % (h, m))
|
|
start += split_interval
|
|
|
|
start = 0
|
|
online_pass_interval = []
|
|
for i in range(pass_per_day):
|
|
online_pass_interval.append([])
|
|
for j in range(start, start + split_per_pass):
|
|
online_pass_interval[i].append(split_path[j])
|
|
start += split_per_pass
|
|
|
|
return online_pass_interval
|
|
|
|
def get_global_metrics(self,
|
|
scope=fluid.global_scope(),
|
|
stat_pos_name="_generated_var_2",
|
|
stat_neg_name="_generated_var_3",
|
|
sqrerr_name="sqrerr",
|
|
abserr_name="abserr",
|
|
prob_name="prob",
|
|
q_name="q",
|
|
pos_ins_num_name="pos",
|
|
total_ins_num_name="total"):
|
|
"""
|
|
get global metrics, including auc, bucket_error, mae, rmse,
|
|
actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num.
|
|
|
|
Args:
|
|
scope(Scope): Scope object, default is fluid.global_scope()
|
|
stat_pos_name(str): name of auc pos bucket Variable
|
|
stat_neg_name(str): name of auc neg bucket Variable
|
|
sqrerr_name(str): name of sqrerr Variable
|
|
abserr_name(str): name of abserr Variable
|
|
prob_name(str): name of prob Variable
|
|
q_name(str): name of q Variable
|
|
pos_ins_num_name(str): name of pos ins num Variable
|
|
total_ins_num_name(str): name of total ins num Variable
|
|
|
|
Returns:
|
|
[auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc,
|
|
mean_predict_qvalue, total_ins_num]
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
metric_list = fleet_util.get_global_metrics(myscope,
|
|
stat_pos.name,
|
|
stat_neg.name,
|
|
local_sqrerr.name,
|
|
local_abserr.name,
|
|
local_prob.name,
|
|
local_q.name,
|
|
local_pos_ins.name,
|
|
local_total_ins.name)
|
|
|
|
# below is part of example model
|
|
label = fluid.layers.data(name="click", shape=[-1, 1],\
|
|
dtype="int64", lod_level=0, append_batch_size=False)
|
|
emb = my_slot_net(slots, label) # emb can be fc layer of size 1
|
|
similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
|
|
emb, min=-15.0, max=15.0), name="similarity_norm")\
|
|
binary_predict = fluid.layers.concat(input=[\
|
|
fluid.layers.elementwise_sub(\
|
|
fluid.layers.ceil(similarity_norm), similarity_norm),\
|
|
similarity_norm], axis=1)
|
|
auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, \
|
|
stat_neg] = fluid.layers.auc(input=binary_predict,\
|
|
label=label, curve='ROC',\
|
|
num_thresholds=4096)
|
|
local_sqrerr, local_abserr, local_prob, local_q, local_pos_ins,\
|
|
local_total_ins = fluid.contrib.layers.ctr_metric_bundle(\
|
|
similarity_norm, label)
|
|
|
|
"""
|
|
if scope.find_var(stat_pos_name) is None or \
|
|
scope.find_var(stat_neg_name) is None:
|
|
self.rank0_print("not found auc bucket")
|
|
return [None] * 9
|
|
elif scope.find_var(sqrerr_name) is None:
|
|
self.rank0_print("not found sqrerr_name=%s" % sqrerr_name)
|
|
return [None] * 9
|
|
elif scope.find_var(abserr_name) is None:
|
|
self.rank0_print("not found abserr_name=%s" % abserr_name)
|
|
return [None] * 9
|
|
elif scope.find_var(prob_name) is None:
|
|
self.rank0_print("not found prob_name=%s" % prob_name)
|
|
return [None] * 9
|
|
elif scope.find_var(q_name) is None:
|
|
self.rank0_print("not found q_name=%s" % q_name)
|
|
return [None] * 9
|
|
elif scope.find_var(pos_ins_num_name) is None:
|
|
self.rank0_print("not found pos_ins_num_name=%s" % pos_ins_num_name)
|
|
return [None] * 9
|
|
elif scope.find_var(total_ins_num_name) is None:
|
|
self.rank0_print("not found total_ins_num_name=%s" % \
|
|
total_ins_num_name)
|
|
return [None] * 9
|
|
|
|
# barrier worker to ensure all workers finished training
|
|
fleet._role_maker._barrier_worker()
|
|
|
|
# get auc
|
|
auc = self.get_global_auc(scope, stat_pos_name, stat_neg_name)
|
|
pos = np.array(scope.find_var(stat_pos_name).get_tensor())
|
|
# auc pos bucket shape
|
|
old_pos_shape = np.array(pos.shape)
|
|
# reshape to one dim
|
|
pos = pos.reshape(-1)
|
|
global_pos = np.copy(pos) * 0
|
|
# mpi allreduce
|
|
fleet._role_maker._all_reduce(pos, global_pos)
|
|
# reshape to its original shape
|
|
global_pos = global_pos.reshape(old_pos_shape)
|
|
# auc neg bucket
|
|
neg = np.array(scope.find_var(stat_neg_name).get_tensor())
|
|
old_neg_shape = np.array(neg.shape)
|
|
neg = neg.reshape(-1)
|
|
global_neg = np.copy(neg) * 0
|
|
fleet._role_maker._all_reduce(neg, global_neg)
|
|
global_neg = global_neg.reshape(old_neg_shape)
|
|
|
|
num_bucket = len(global_pos[0])
|
|
|
|
def get_metric(name):
|
|
metric = np.array(scope.find_var(name).get_tensor())
|
|
old_metric_shape = np.array(metric.shape)
|
|
metric = metric.reshape(-1)
|
|
global_metric = np.copy(metric) * 0
|
|
fleet._role_maker._all_reduce(metric, global_metric)
|
|
global_metric = global_metric.reshape(old_metric_shape)
|
|
return global_metric[0]
|
|
|
|
global_sqrerr = get_metric(sqrerr_name)
|
|
global_abserr = get_metric(abserr_name)
|
|
global_prob = get_metric(prob_name)
|
|
global_q_value = get_metric(q_name)
|
|
# note: the ins_num derived from the auc buckets is not the actual value,
# so get it from the metric op instead
|
|
pos_ins_num = get_metric(pos_ins_num_name)
|
|
total_ins_num = get_metric(total_ins_num_name)
|
|
neg_ins_num = total_ins_num - pos_ins_num
|
|
|
|
mae = global_abserr / total_ins_num
|
|
rmse = math.sqrt(global_sqrerr / total_ins_num)
|
|
return_actual_ctr = pos_ins_num / total_ins_num
|
|
predicted_ctr = global_prob / total_ins_num
|
|
mean_predict_qvalue = global_q_value / total_ins_num
|
|
copc = 0.0
|
|
if abs(predicted_ctr) > 1e-6:
|
|
copc = return_actual_ctr / predicted_ctr
|
|
|
|
# calculate bucket error
|
|
last_ctr = -1.0
|
|
impression_sum = 0.0
|
|
ctr_sum = 0.0
|
|
click_sum = 0.0
|
|
error_sum = 0.0
|
|
error_count = 0.0
|
|
click = 0.0
|
|
show = 0.0
|
|
ctr = 0.0
|
|
adjust_ctr = 0.0
|
|
relative_error = 0.0
|
|
actual_ctr = 0.0
|
|
relative_ctr_error = 0.0
|
|
k_max_span = 0.01
|
|
k_relative_error_bound = 0.05
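# bucket error: walk the score buckets in increasing predicted ctr,
# grouping adjacent buckets until the predicted ctr moves more than
# k_max_span away from the start of the group; once a group has enough
# impressions for its estimated relative error to drop below
# k_relative_error_bound, record |actual_ctr / predicted_ctr - 1| weighted
# by impressions; bucket_error is the impression-weighted mean of these
# relative errors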
|
|
for i in range(num_bucket):
|
|
click = global_pos[0][i]
|
|
show = global_pos[0][i] + global_neg[0][i]
|
|
ctr = float(i) / num_bucket
|
|
if abs(ctr - last_ctr) > k_max_span:
|
|
last_ctr = ctr
|
|
impression_sum = 0.0
|
|
ctr_sum = 0.0
|
|
click_sum = 0.0
|
|
impression_sum += show
|
|
ctr_sum += ctr * show
|
|
click_sum += click
|
|
if impression_sum == 0:
|
|
continue
|
|
adjust_ctr = ctr_sum / impression_sum
|
|
if adjust_ctr == 0:
|
|
continue
|
|
relative_error = \
|
|
math.sqrt((1 - adjust_ctr) / (adjust_ctr * impression_sum))
|
|
if relative_error < k_relative_error_bound:
|
|
actual_ctr = click_sum / impression_sum
|
|
relative_ctr_error = abs(actual_ctr / adjust_ctr - 1)
|
|
error_sum += relative_ctr_error * impression_sum
|
|
error_count += impression_sum
|
|
last_ctr = -1
|
|
|
|
bucket_error = error_sum / error_count if error_count > 0 else 0.0
|
|
|
|
return [
|
|
auc, bucket_error, mae, rmse, return_actual_ctr, predicted_ctr,
|
|
copc, mean_predict_qvalue, int(total_ins_num)
|
|
]
|
|
|
|
def print_global_metrics(self,
|
|
scope=fluid.global_scope(),
|
|
stat_pos_name="_generated_var_2",
|
|
stat_neg_name="_generated_var_3",
|
|
sqrerr_name="sqrerr",
|
|
abserr_name="abserr",
|
|
prob_name="prob",
|
|
q_name="q",
|
|
pos_ins_num_name="pos",
|
|
total_ins_num_name="total",
|
|
print_prefix=""):
|
|
"""
|
|
print global metrics, including auc, bucket_error, mae, rmse,
|
|
actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num.
|
|
|
|
Args:
|
|
scope(Scope): Scope object, default is fluid.global_scope()
|
|
stat_pos_name(str): name of auc pos bucket Variable
|
|
stat_neg_name(str): name of auc neg bucket Variable
|
|
sqrerr_name(str): name of sqrerr Variable
|
|
abserr_name(str): name of abserr Variable
|
|
prob_name(str): name of prob Variable
|
|
q_name(str): name of q Variable
|
|
pos_ins_num_name(str): name of pos ins num Variable
|
|
total_ins_num_name(str): name of total ins num Variable
|
|
print_prefix(str): print prefix
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
fleet_util.print_global_metrics(myscope,
|
|
stat_pos.name,
|
|
stat_neg.name,
|
|
local_sqrerr.name,
|
|
local_abserr.name,
|
|
local_prob.name,
|
|
local_q.name,
|
|
local_pos_ins.name,
|
|
local_total_ins.name)
|
|
|
|
# below is part of model
|
|
label = fluid.layers.data(name="click", shape=[-1, 1],\
|
|
dtype="int64", lod_level=0, append_batch_size=False)
|
|
emb = my_slot_net(slots, label) # emb can be fc layer of size 1
|
|
similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
|
|
emb, min=-15.0, max=15.0), name="similarity_norm")\
|
|
binary_predict = fluid.layers.concat(input=[\
|
|
fluid.layers.elementwise_sub(\
|
|
fluid.layers.ceil(similarity_norm), similarity_norm),\
|
|
similarity_norm], axis=1)
|
|
auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, \
|
|
stat_neg] = fluid.layers.auc(input=binary_predict,\
|
|
label=label, curve='ROC',\
|
|
num_thresholds=4096)
|
|
local_sqrerr, local_abserr, local_prob, local_q, local_pos_ins, \
|
|
local_total_ins = fluid.contrib.layers.ctr_metric_bundle(\
|
|
similarity_norm, label)
|
|
|
|
"""
|
|
if scope.find_var(stat_pos_name) is None or \
|
|
scope.find_var(stat_neg_name) is None:
|
|
self.rank0_print("not found auc bucket")
|
|
return
|
|
elif scope.find_var(sqrerr_name) is None:
|
|
self.rank0_print("not found sqrerr_name=%s" % sqrerr_name)
|
|
return
|
|
elif scope.find_var(abserr_name) is None:
|
|
self.rank0_print("not found abserr_name=%s" % abserr_name)
|
|
return
|
|
elif scope.find_var(prob_name) is None:
|
|
self.rank0_print("not found prob_name=%s" % prob_name)
|
|
return
|
|
elif scope.find_var(q_name) is None:
|
|
self.rank0_print("not found q_name=%s" % q_name)
|
|
return
|
|
elif scope.find_var(pos_ins_num_name) is None:
|
|
self.rank0_print("not found pos_ins_num_name=%s" % pos_ins_num_name)
|
|
return
|
|
elif scope.find_var(total_ins_num_name) is None:
|
|
self.rank0_print("not found total_ins_num_name=%s" % \
|
|
total_ins_num_name)
|
|
return
|
|
|
|
auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc,\
|
|
mean_predict_qvalue, total_ins_num = self.get_global_metrics(\
|
|
scope, stat_pos_name, stat_neg_name, sqrerr_name, abserr_name,\
|
|
prob_name, q_name, pos_ins_num_name, total_ins_num_name)
|
|
self.rank0_print("%s global AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f "
|
|
"RMSE=%.6f Actural_CTR=%.6f Predicted_CTR=%.6f "
|
|
"COPC=%.6f MEAN Q_VALUE=%.6f Ins number=%s" %
|
|
(print_prefix, auc, bucket_error, mae, rmse,
|
|
actual_ctr, predicted_ctr, copc, mean_predict_qvalue,
|
|
total_ins_num))
|
|
|
|
def program_type_trans(self, prog_dir, prog_fn, is_text):
|
|
return utils.program_type_trans(prog_dir, prog_fn, is_text)
|
|
|
|
def draw_from_program_file(self, model_filename, is_text, output_dir,
|
|
output_filename):
|
|
"""draw program from file"""
|
|
program = utils.load_program(model_filename, is_text)
|
|
utils.graphviz(program.global_block(), output_dir, output_filename)
|
|
|
|
def draw_from_program(self, program, output_dir, output_name):
|
|
"""draw Program"""
|
|
utils.graphviz(program.global_block(), output_dir, output_name)
|
|
|
|
def check_two_programs(self, config):
|
|
train_prog = utils.load_program(config.train_prog_path,
|
|
config.is_text_train_program)
|
|
pruned_prog = utils.load_program(config.pruned_prog_path,
|
|
config.is_text_pruned_program)
|
|
if config.draw:
|
|
pruned_dir = os.path.dirname(config.pruned_prog_path)
|
|
self.draw_from_program(pruned_prog, pruned_dir,
|
|
config.draw_out_name)
|
|
res = utils.check_pruned_program_vars(train_prog, pruned_prog)
|
|
if res:
|
|
_logger.info("check_programs succeed.")
|
|
else:
|
|
_logger.info(
|
|
"check_programs failed. pruned program and train program not match!"
|
|
)
|
|
return res
|
|
|
|
def check_vars_and_dump(self, config):
|
|
_logger.info("start check_vars_and_dump.")
|
|
results = utils.check_saved_vars_try_dump(
|
|
config.dump_model_dir, config.dump_program_filename,
|
|
config.is_text_dump_program, config.feed_config,
|
|
config.fetch_config, config.batch_size, config.save_params_filename)
|
|
_logger.info("check_vars_and_dump succeed.")
|
|
return results
|
|
|
|
def parse_program_proto(self, prog_path, is_text, output_dir):
|
|
"""
|
|
Parse program.proto into a more readable format.
|
|
This function will generate three files:
|
|
output_dir/vars_all.log,
|
|
output_dir/vars_persistable.log,
|
|
output_dir/ops.log.
|
|
|
|
Args:
|
|
prog_path(str): proto file path to be parsed.
|
|
is_text(bool): whether the proto file is in human-readable text format (True) or in binary format (False).
|
|
output_dir(str): output dir.
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
|
|
fleet_util = FleetUtil()
|
|
program_path = "./program.pbtxt"
|
|
is_text = True
|
|
output_dir = "/tmp/"
|
|
fleet_util.parse_program_proto(program_path, is_text, output_dir)
|
|
"""
|
|
program = utils.load_program(prog_path, is_text)
|
|
utils.parse_program(program, output_dir)
|